Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/models.py

1from __future__ import annotations

3from encodings.aliases import aliases

4from json import dumps

5from re import sub

6from typing import Any, Iterator, List, Tuple

8from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE

9from .utils import iana_name, is_multi_byte_encoding, unicode_range

class CharsetMatch:
    """
    One candidate encoding for a byte payload, together with the metrics
    (chaos, coherence, languages) used to rank it against other candidates.

    Instances define ``__eq__`` without ``__hash__`` and are therefore unhashable.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Name of the encoding this match stands for.
        :param mean_mess_ratio: Chaos ratio, exposed through the ``chaos`` property.
        :param has_sig_or_bom: Whether a signature / byte order mark was found.
        :param languages: ``(language, ratio)`` pairs; the first entry is the one
            reported by ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text, if available. When omitted the
            payload is decoded lazily on first ``str()`` call.
        :param preemptive_declaration: Encoding name declared inside the content
            itself, if any. Only used by ``output()`` to patch that declaration.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(); both fields are always written together.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Compare against another CharsetMatch (same encoding and same fingerprint)
        or against an encoding name given as str (normalized through iana_name).
        Any other type compares unequal.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        A match that is "less than" another one sorts first.

        :raises ValueError: if ``other`` is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        Share of the raw payload "saved" by decoding: 1 - (decoded length / raw length).
        Returns 0.0 for an empty payload instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """Return the decoded payload, decoding it strictly on first access."""
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' fp({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Register ``other`` as a submatch (leaf) of this match.

        :raises ValueError: if ``other`` is not a CharsetMatch or compares equal to self.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio attached to the first language entry, or 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """Sorted, de-duplicated unicode range names of the decoded text (computed once, then cached)."""
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the encoder
        ("replace" error handler); they are neither dropped nor cause an exception.

        The result is cached for the last successfully used target encoding. When the
        content carried a non UTF-8 preemptive declaration, the first declaration found
        within the first 8192 characters is rewritten to name the target encoding.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")
            # Record the encoding only once encoding succeeded, so that a failure
            # (e.g. unknown codec) never leaves a stale payload cached under it.
            self._output_encoding = encoding

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> int:
        """
        Retrieve a hash fingerprint of the decoded payload, used for deduplication.
        """
        return hash(str(self))

242

243

class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        """
        :param results: Initial matches; they are sorted on the way in. ``None`` or an
            empty list yields an empty container.
        """
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        :raises IndexError: if ``item`` is an int outside the range of stored results.
        :raises KeyError: if ``item`` is an encoding name not present in the results,
            or is neither an int nor a str. The requested item is attached to the error.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            normalized_name = iana_name(item, False)
            for result in self._results:
                if normalized_name in result.could_be_from_charset:
                    return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: if ``item`` is not a CharsetMatch (an error raised by
            ``CharsetMatch.add_submatch`` propagates as well).
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same decoded text and same chaos: fold the newcomer into the existing match.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> CharsetMatch | None:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

309

310

# One detected language paired with a float ratio: (language name, ratio).
CoherenceMatch = Tuple[str, float]

# A list of such pairs. CharsetMatch reads its first entry as the most probable
# language -- presumably callers pass it already ordered; verify against the producer.
CoherenceMatches = List[CoherenceMatch]

313

314

class CliDetectionResult:
    """Plain record of one detection outcome, serializable to JSON via ``to_json``."""

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        # Where the input came from / where a normalized copy lives (if any).
        self.path: str = path
        self.unicode_path: str | None = unicode_path

        # What was detected.
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.language: str = language
        self.alphabets: list[str] = alphabets

        # Ranking information.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        """Expose the public fields as a mapping, in the fixed order used for serialization."""
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field_name: getattr(self, field_name) for field_name in exported_fields}

    def to_json(self) -> str:
        """Serialize the record as ASCII-only JSON indented by four spaces."""
        serializable = self.__dict__
        return dumps(serializable, indent=4, ensure_ascii=True)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/models.py: 35%

184 statements