from __future__ import annotations

from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Iterator, List, Tuple

from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range


class CharsetMatch:
    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: list[str] | None = None

        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, CharsetMatch):
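            # A plain string is treated as an encoding name, so expressions such
            # as `match == "utf_8"` compare against the normalized IANA name.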
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return (
            self.encoding == other.encoding and self.fingerprint == other.fingerprint
        )

    def __lt__(self, other: object) -> bool:
        """
        Implemented so that sorted() can be used on a list of CharsetMatch items.
        """
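        # Illustrative sketch (hypothetical names): with two CharsetMatch
        # instances `a` and `b`, `sorted([a, b])[0]` is the lower-chaos match;
        # coherence and multi-byte usage act as tie-breakers below.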
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Chaos difference below 1%: fall back to coherence.
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When the decision is difficult, prefer the result that decoded the
            # most multi-byte sequences; skip this for very large payloads to
            # preserve RAM.
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
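        # Share of the raw payload taken up by multi-byte sequences:
        # 1.0 minus (decoded character count / raw byte count).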
        return 1.0 - (len(str(self)) / len(self.raw))

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encodings are known by many names; this can help when, for example,
        IBM855 is also listed as CP855.
        """
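        # encodings.aliases maps alias -> canonical codec name; collect matches
        # in both directions so either spelling of the encoding is covered.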
        also_known_as: list[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in the decoded sequence.
        Usually not very useful. The returned list may be empty even if the
        'language' property returns something other than 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in the decoded sequence. If none were detected
        or inferred, the property will return "Unknown".
        """
        if not self._languages:
            # Try to infer the language from the given encoding.
            # It is either English or we should not commit to an answer.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # Imported here to avoid a circular import.
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

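            # "Latin Based" is too generic to name a single language, so it is
            # treated the same as having no result at all.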
            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # List the detected Unicode ranges.
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # Filter out None values and sort the unique ranges.
        self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encodings that produce the exact same str result and
        could therefore be the originating encoding.
        This list includes the encoding available in the 'encoding' property.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get the re-encoded bytes payload using the given target encoding.
        Defaults to UTF-8. Characters that cannot be represented in the target
        encoding are replaced (errors="replace"), not dropped.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            decoded_string = str(self)
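            # If the payload declared its own encoding in-band (e.g. an XML prolog
            # or HTML meta charset) and that declaration is not already UTF-8,
            # rewrite the declaration within the first 8 KiB so it names the
            # target encoding.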
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            self._output_payload = decoded_string.encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded)
        payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()


class CharsetMatches:
    """
    Container holding CharsetMatch items, ordered by default from the most probable
    to the least probable. Acts like a list (iterable) but does not implement all
    related methods.
    """
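    # Illustrative usage (a hedged sketch; `from_bytes` is the package-level
    # helper that returns a CharsetMatches instance):
    #   matches = from_bytes(b"caf\xc3\xa9")
    #   matches.best()      # most probable CharsetMatch, or None if empty
    #   matches["utf_8"]    # lookup by IANA name or alias; KeyError if absent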

    def __init__(self, results: list[CharsetMatch] | None = None):
        self._results: list[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator[CharsetMatch]:
        yield from self._results

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Retrieve a single item either by its position or by encoding name (an alias
        may be used here).
        Raises KeyError for an invalid index or an encoding not present in the results.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
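            # Normalize the requested name (aliases resolved) before comparing it
            # against each match's candidate charsets.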
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        raise KeyError

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: CharsetMatch) -> None:
        """
        Insert a single match. It will be inserted so that the sort order is preserved.
        It may be folded in as a submatch of an existing result.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # Submatch factoring is disabled when the input is too heavy, to conserve RAM.
        if len(item.raw) < TOO_BIG_SEQUENCE:
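            # Matches that decode to an identical string (same fingerprint) with
            # the same chaos are folded in as submatches instead of being listed
            # separately.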
            for match in self._results:
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Simply return the first match. Strictly equivalent to matches[0].
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> CharsetMatch | None:
        """
        Redundant method; delegates to best(). Kept for backward-compatibility reasons.
        """
        return self.best()


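# A CoherenceMatch pairs a language name with its coherence ratio (0.0-1.0);
# CoherenceMatches is the ranked list of such pairs attached to a CharsetMatch.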
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]


class CliDetectionResult:
    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        self.path: str = path
        self.unicode_path: str | None = unicode_path
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
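        # __dict__ is exposed as a property so that to_json() serializes a stable,
        # explicit set of keys.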
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
            "alphabets": self.alphabets,
            "has_sig_or_bom": self.has_sig_or_bom,
            "chaos": self.chaos,
            "coherence": self.coherence,
            "unicode_path": self.unicode_path,
            "is_preferred": self.is_preferred,
        }

    def to_json(self) -> str:
        return dumps(self.__dict__, ensure_ascii=True, indent=4)