1from __future__ import annotations
2
3import importlib
4from codecs import IncrementalDecoder
5from collections import Counter
6from functools import lru_cache
7from typing import Counter as TypeCounter
8
9from .constant import (
10 FREQUENCIES,
11 KO_NAMES,
12 LANGUAGE_SUPPORTED_COUNT,
13 TOO_SMALL_SEQUENCE,
14 ZH_NAMES,
15 _FREQUENCIES_SET,
16 _FREQUENCIES_RANK,
17)
18from .md import is_suspiciously_successive_range
19from .models import CoherenceMatches
20from .utils import (
21 is_accentuated,
22 is_latin,
23 is_multi_byte_encoding,
24 is_unicode_range_secondary,
25 unicode_range,
26)
27
28
def encoding_unicode_range(iana_name: str) -> list[str]:
    """
    Return associated unicode ranges in a single byte code page.

    Decodes every byte in 0x40-0xFE with the given code page and keeps the
    unicode ranges (non-secondary only) that account for at least 15 % of the
    decodable characters.

    :param iana_name: IANA name of a single-byte encoding (e.g. "cp1252").
    :raises OSError: If the encoding is a multi-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise OSError( # Defensive:
            "Function not supported on multi-byte code page"
        )

    decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder

    p: IncrementalDecoder = decoder(errors="ignore")
    seen_ranges: dict[str, int] = {}
    character_count: int = 0

    for i in range(0x40, 0xFF):
        chunk: str = p.decode(bytes([i]))

        if not chunk:
            continue

        character_range: str | None = unicode_range(chunk)

        if character_range is None:
            continue

        # Secondary ranges (symbols, punctuation blocks, ...) are not
        # discriminating for language detection; skip them.
        if is_unicode_range_secondary(character_range) is False:
            seen_ranges[character_range] = seen_ranges.get(character_range, 0) + 1
            character_count += 1

    # Guard: avoid ZeroDivisionError when no byte decoded to a
    # primary-range character (degenerate/defensive case).
    if character_count == 0:
        return []

    return sorted(
        character_range
        for character_range, count in seen_ranges.items()
        if count / character_count >= 0.15
    )
66
67
def unicode_range_languages(primary_range: str) -> list[str]:
    """
    Return inferred languages used with a unicode range.
    """
    # A language qualifies as soon as one of its frequent characters
    # falls inside the given range (any() short-circuits like the
    # original break-on-first-hit loop).
    return [
        language
        for language, characters in FREQUENCIES.items()
        if any(unicode_range(character) == primary_range for character in characters)
    ]
81
82
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
    """
    Single-byte encoding language association. Some code page are heavily linked to particular language(s).
    This function does the correspondence.
    """
    detected_ranges: list[str] = encoding_unicode_range(iana_name)

    # The first non-Latin range (if any) drives the association;
    # Latin ranges alone are not discriminating.
    primary_range: str | None = next(
        (specified_range for specified_range in detected_ranges if "Latin" not in specified_range),
        None,
    )

    if primary_range is None:
        return ["Latin Based"]

    return unicode_range_languages(primary_range)
101
102
103@lru_cache()
104def mb_encoding_languages(iana_name: str) -> list[str]:
105 """
106 Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
107 This function does the correspondence.
108 """
109 if (
110 iana_name.startswith("shift_")
111 or iana_name.startswith("iso2022_jp")
112 or iana_name.startswith("euc_j")
113 or iana_name == "cp932"
114 ):
115 return ["Japanese"]
116 if iana_name.startswith("gb") or iana_name in ZH_NAMES:
117 return ["Chinese"]
118 if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
119 return ["Korean"]
120
121 return []
122
123
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
    """
    Determine main aspects from a supported language if it contains accents and if is pure Latin.
    """
    characters = FREQUENCIES[language]

    # Two independent scans; each short-circuits as soon as its
    # answer is known, like the flag loop it replaces.
    has_accents: bool = any(is_accentuated(character) for character in characters)
    pure_latin: bool = all(is_latin(character) for character in characters)

    return has_accents, pure_latin
139
140
def alphabet_languages(
    characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
    """
    Return associated languages associated to given characters.
    """
    candidates: list[tuple[str, float]] = []

    given_characters: frozenset[str] = frozenset(characters)
    source_has_accents = any(is_accentuated(character) for character in characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        # Caller may restrict matching to pure-Latin languages.
        if ignore_non_latin and not target_pure_latin:
            continue

        # An accented source cannot match an accent-free language.
        if source_has_accents and not target_have_accents:
            continue

        match_ratio: float = len(
            _FREQUENCIES_SET[language] & given_characters
        ) / len(language_characters)

        if match_ratio >= 0.2:
            candidates.append((language, match_ratio))

    candidates.sort(key=lambda entry: entry[1], reverse=True)

    return [name for name, _ in candidates]
173
174
def characters_popularity_compare(
    language: str, ordered_characters: list[str]
) -> float:
    """
    Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)

    :param language: A key of FREQUENCIES.
    :param ordered_characters: Characters ordered from most to least frequent.
    :raises ValueError: If the given language is not supported.
    """
    if language not in FREQUENCIES:
        raise ValueError(f"{language} not available") # Defensive:

    # Guard: an empty sequence cannot match anything; without it the
    # projection-ratio division below raises ZeroDivisionError.
    if not ordered_characters:
        return 0.0

    character_approved_count: int = 0
    frequencies_language_set: frozenset[str] = _FREQUENCIES_SET[language]
    lang_rank: dict[str, int] = _FREQUENCIES_RANK[language]

    ordered_characters_count: int = len(ordered_characters)
    target_language_characters_count: int = len(FREQUENCIES[language])

    # Languages with more than 26 frequent characters (e.g. ideographic
    # scripts) get a looser rank-distance tolerance below.
    large_alphabet: bool = target_language_characters_count > 26

    # Scale factor mapping a rank in `ordered_characters` onto the
    # language's frequency-rank scale.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    # Pre-built rank dict for ordered_characters (avoids repeated list slicing).
    ordered_rank: dict[str, int] = {
        char: rank for rank, char in enumerate(ordered_characters)
    }

    # Pre-compute characters common to both orderings.
    # Avoids repeated `c in ordered_rank` dict lookups in the inner counts.
    common_chars: list[tuple[int, int]] = [
        (lr, ordered_rank[c]) for c, lr in lang_rank.items() if c in ordered_rank
    ]

    # Pre-extract lr and orr arrays for faster iteration in the inner loop.
    # Plain integer loops with local arrays are much faster under mypyc than
    # generator expression sums over a list of tuples.
    common_count: int = len(common_chars)
    common_lr: list[int] = [p[0] for p in common_chars]
    common_orr: list[int] = [p[1] for p in common_chars]

    for character, character_rank in zip(
        ordered_characters, range(0, ordered_characters_count)
    ):
        if character not in frequencies_language_set:
            continue

        character_rank_in_language: int = lang_rank[character]
        character_rank_projection: int = int(character_rank * expected_projection_ratio)

        # Small alphabet: reject characters whose observed rank is too far
        # from their expected rank in the language.
        if (
            large_alphabet is False
            and abs(character_rank_projection - character_rank_in_language) > 4
        ):
            continue

        # Large alphabet: accept directly when the rank distance stays
        # within a third of the alphabet size.
        if (
            large_alphabet is True
            and abs(character_rank_projection - character_rank_in_language)
            < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        # Count how many characters appear "before" in both orderings,
        # and how many appear "at or after" in both orderings.
        # Single pass over pre-extracted arrays — much faster under mypyc
        # than two generator expression sums.
        before_match_count: int = 0
        after_match_count: int = 0
        for i in range(common_count):
            lr_i: int = common_lr[i]
            orr_i: int = common_orr[i]
            if lr_i < character_rank_in_language:
                if orr_i < character_rank:
                    before_match_count += 1
            else:
                if orr_i >= character_rank:
                    after_match_count += 1

        after_len: int = target_language_characters_count - character_rank_in_language

        # Lenient acceptance near the extremes of the language ranking.
        if character_rank_in_language == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if after_len == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        # Otherwise require at least 40 % ordering agreement on either side.
        if (
            character_rank_in_language > 0
            and before_match_count / character_rank_in_language >= 0.4
        ) or (after_len > 0 and after_match_count / after_len >= 0.4):
            character_approved_count += 1
            continue

    return character_approved_count / len(ordered_characters)
274
275
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
    """
    Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
    Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
    One containing the latin letters and the other hebrew.
    """
    buckets: dict[str, list[str]] = {}

    # Single-bucket bookkeeping: while only one script has been seen,
    # the dict scan below can be skipped entirely.
    first_bucket: str | None = None
    several_buckets: bool = False

    # Cache of the previous character's range and its resolved bucket so
    # consecutive same-range characters bypass the resolution logic
    # (and the is_suspiciously_successive_range calls it performs).
    last_range: str | None = None
    last_bucket: str | None = None

    for symbol in decoded_sequence:
        if not symbol.isalpha():
            continue

        # ASCII fast-path: an alphabetic character below 128 is a-z/A-Z,
        # always "Basic Latin" — skip the unicode_range() call.
        if ord(symbol) < 128:
            symbol_range: str | None = "Basic Latin"
        else:
            symbol_range = unicode_range(symbol)
            if symbol_range is None:
                continue

        # Same range as previous character → reuse its cached bucket.
        if symbol_range == last_range:
            if last_bucket is not None:
                buckets[last_bucket].append(symbol)
            continue

        target_bucket: str | None = None

        if several_buckets:
            for known_range in buckets:
                if not is_suspiciously_successive_range(known_range, symbol_range):
                    target_bucket = known_range
                    break
        elif first_bucket is not None and not is_suspiciously_successive_range(
            first_bucket, symbol_range
        ):
            target_bucket = first_bucket

        if target_bucket is None:
            target_bucket = symbol_range

        if target_bucket not in buckets:
            buckets[target_bucket] = []
            if first_bucket is None:
                first_bucket = target_bucket
            else:
                several_buckets = True

        buckets[target_bucket].append(symbol)

        # Remember resolution for the next iteration.
        last_range = symbol_range
        last_bucket = target_bucket

    return ["".join(collected).lower() for collected in buckets.values()]
348
349
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
    """
    This function merge results previously given by the function coherence_ratio.
    The return type is the same as coherence_ratio.
    """
    ratios_by_language: dict[str, list[float]] = {}

    for sub_results in results:
        for language, ratio in sub_results:
            ratios_by_language.setdefault(language, []).append(ratio)

    # Average each language's ratios, rounded like coherence_ratio does.
    merged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]

    return sorted(merged, key=lambda entry: entry[1], reverse=True)
376
377
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function only keeps the best match and remove the em-dash in it.
    """
    grouped: dict[str, list[float]] = {}

    for language, ratio in results:
        # Strip the em-dash marker so alternatives collapse onto the
        # base language name.
        grouped.setdefault(language.replace("—", ""), []).append(ratio)

    # No language collected more than one ratio → nothing to merge.
    if all(len(ratios) == 1 for ratios in grouped.values()):
        return results

    return [(language, max(ratios)) for language, ratios in grouped.items()]
403
404
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
    A layer = Character extraction by alphabets/ranges.
    """

    results: list[tuple[str, float]] = []
    ignore_non_latin: bool = False

    sufficient_match_count: int = 0

    included_languages: list[str] = (
        lg_inclusion.split(",") if lg_inclusion is not None else []
    )
    if "Latin Based" in included_languages:
        ignore_non_latin = True
        included_languages.remove("Latin Based")

    for layer in alpha_unicode_split(decoded_sequence):
        # Layers too small to carry a reliable signal are skipped.
        if len(layer) <= TOO_SMALL_SEQUENCE:
            continue

        # Characters of this layer ordered from most to least frequent.
        ordered_characters: list[str] = [
            character for character, _ in Counter(layer).most_common()
        ]

        candidate_languages = included_languages or alphabet_languages(
            ordered_characters, ignore_non_latin
        )

        for language in candidate_languages:
            ratio: float = characters_popularity_compare(
                language, ordered_characters
            )

            if ratio < threshold:
                continue
            if ratio >= 0.8:
                sufficient_match_count += 1

            results.append((language, round(ratio, 4)))

            # Three strong matches are enough for this layer.
            if sufficient_match_count >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
    )