Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/models.py

1from __future__ import annotations

3from encodings.aliases import aliases

4from json import dumps

5from re import sub

6from typing import Any, Iterator, List, Tuple

8from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE

9from .utils import iana_name, is_multi_byte_encoding, unicode_range

class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    Keeps the raw bytes, the guessed encoding, the mess ("chaos") ratio, the
    language coherence matches and, lazily, the decoded text. Instances
    implement ``<`` so that ``sorted()`` can rank candidates.

    Note: ``__eq__`` is defined without ``__hash__``, so instances are
    unhashable.
    """

    def __init__(
        self,
        payload: bytes | bytearray,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: CoherenceMatches,
        decoded_payload: str | None = None,
        preemptive_declaration: str | None = None,
    ):
        """
        :param payload: Raw bytes this match was computed from.
        :param guessed_encoding: Codec name used to decode ``payload``.
        :param mean_mess_ratio: Value exposed through ``chaos``.
        :param has_sig_or_bom: Value exposed through ``bom`` / ``byte_order_mark``.
        :param languages: ``(language, ratio)`` pairs; the first pair drives
            ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text. When ``None`` the payload
            is decoded on the first ``str()`` call.
        :param preemptive_declaration: Presumably the encoding name declared
            inside the content itself (TODO confirm against the caller); only
            used by ``output()`` to decide whether to rewrite that declaration.
        """
        self._payload: bytes | bytearray = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the ``alphabets`` property.
        self._unicode_ranges: list[str] | None = None

        # Other matches that decode to the very same text (see add_submatch).
        self._leaves: list[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for ``output()``: last produced bytes and the encoding used.
        self._output_payload: bytes | None = None
        self._output_encoding: str | None = None

        # Decoded text, filled lazily by ``__str__`` when not provided.
        self._string: str | None = decoded_payload

        self._preemptive_declaration: str | None = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Compare with another match (same encoding and same fingerprint) or
        with an encoding name given as ``str`` (normalised through
        ``iana_name``). Any other type compares unequal.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        :raises ValueError: If ``other`` is not a ``CharsetMatch``.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 0.5% difference --> Use Coherence
        if chaos_difference < 0.005 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.005 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        ``1 - (decoded character count / raw byte count)``.

        Returns ``0.0`` for an empty payload instead of dividing by zero.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Return the decoded text, decoding the payload strictly on first use.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
            # UTF-7 BOM is encoded in modified Base64 whose byte boundary
            # can overlap with the next character, so raw-byte stripping
            # is unreliable. Strip the decoded BOM character instead.
            if (
                self._has_sig_or_bom
                and self._encoding == "utf_7"
                and self._string
                and self._string[0] == "\ufeff"
            ):
                self._string = self._string[1:]
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' fp({self.fingerprint})>"

    def add_submatch(self, other: CharsetMatch) -> None:
        """
        Register ``other`` as an alternative encoding for the same text.

        :raises ValueError: If ``other`` is not a ``CharsetMatch`` or compares
            equal to this instance.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        """Guessed encoding name as given to the constructor."""
        return self._encoding

    @property
    def encoding_aliases(self) -> list[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: list[str] = []
        # ``aliases`` maps an alias to its codec name; collect both directions.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        """Whether the payload was flagged as carrying a signature or BOM."""
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """Same value as ``bom``."""
        return self._has_sig_or_bom

    @property
    def languages(self) -> list[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        """Mean mess ratio given to the constructor."""
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Ratio of the first language match, or ``0.0`` when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        """``chaos`` as a percentage rounded to three decimals."""
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        """``coherence`` as a percentage rounded to three decimals."""
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes | bytearray:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> list[CharsetMatch]:
        """Matches registered through ``add_submatch`` (the live list, not a copy)."""
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        """Whether at least one submatch has been registered."""
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> list[str]:
        """
        Sorted, de-duplicated unicode range names of the decoded characters.
        Computed once and cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> list[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the
        encoder's "replace" error handler; they are not silently dropped.

        The result is cached for the last requested ``encoding``. The cache is
        only updated once encoding succeeded, so a failing call (for example an
        unknown codec name) raises again on retry instead of returning a stale
        payload.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            decoded_string = str(self)
            if (
                self._preemptive_declaration is not None
                and self._preemptive_declaration.lower()
                not in ["utf-8", "utf8", "utf_8"]
            ):
                # Rewrite the first encoding indication found in the leading
                # 8192 characters so it names the target encoding.
                patched_header = sub(
                    RE_POSSIBLE_ENCODING_INDICATION,
                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                        m.groups()[0],
                        iana_name(encoding).replace("_", "-"),
                    ),
                    decoded_string[:8192],
                    count=1,
                )

                decoded_string = patched_header + decoded_string[8192:]

            encoded_payload = decoded_string.encode(encoding, "replace")

            # Commit both cache fields together, after everything succeeded.
            self._output_payload = encoded_payload
            self._output_encoding = encoding

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> int:
        """
        Retrieve a hash fingerprint of the decoded payload, used for deduplication.
        """
        return hash(str(self))

252

253

class CharsetMatches:
    """
    Sorted collection of CharsetMatch items, most probable first.
    Behaves like a read-mostly list but only implements a subset of the list protocol.
    """

    def __init__(self, results: list[CharsetMatch] | None = None):
        """
        :param results: Optional initial matches; they are sorted on entry.
            ``None`` or an empty list yields an empty container.
        """
        if results:
            self._results: list[CharsetMatch] = sorted(results)
        else:
            self._results = []

    def __iter__(self) -> Iterator[CharsetMatch]:
        """Iterate over the matches in their sorted order."""
        for entry in self._results:
            yield entry

    def __getitem__(self, item: int | str) -> CharsetMatch:
        """
        Look a match up by position or by encoding name (alias may be used here).

        An integer is used as a plain list index, so an out-of-range position
        raises IndexError. A string is normalised with ``iana_name`` and the
        first match listing it in ``could_be_from_charset`` is returned.
        Raise KeyError when no match carries that encoding, or when ``item``
        is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            wanted = iana_name(item, False)
            for candidate in self._results:
                if wanted in candidate.could_be_from_charset:
                    return candidate
        raise KeyError

    def __len__(self) -> int:
        """Number of top-level matches held."""
        return len(self._results)

    def __bool__(self) -> bool:
        """True when at least one match is held."""
        return bool(self._results)

    def append(self, item: CharsetMatch) -> None:
        """
        Add one match while keeping the container sorted.

        When the payload is smaller than TOO_BIG_SEQUENCE and an existing match
        has the same fingerprint and the same chaos, ``item`` is attached to
        that match as a submatch instead of being inserted on its own.

        :raises ValueError: If ``item`` is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # Submatch factoring is skipped for heavy inputs to conserve RAM.
        factoring_allowed = len(item.raw) < TOO_BIG_SEQUENCE
        if factoring_allowed:
            for existing in self._results:
                if existing.fingerprint == item.fingerprint and existing.chaos == item.chaos:
                    existing.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> CharsetMatch | None:
        """
        Return the first (most probable) match, or None when empty.
        """
        return self._results[0] if self._results else None

    def first(self) -> CharsetMatch | None:
        """
        Alias of best(), kept for backward compatibility.
        """
        return self.best()

319

320

# One language guess: (language name, coherence ratio).
CoherenceMatch = Tuple[str, float]

# List of language guesses; CharsetMatch reads the first entry for its
# ``language`` and ``coherence`` properties.
CoherenceMatches = List[CoherenceMatch]

323

324

class CliDetectionResult:
    """
    Plain record describing one detection result, serialisable to JSON.
    """

    def __init__(
        self,
        path: str,
        encoding: str | None,
        encoding_aliases: list[str],
        alternative_encodings: list[str],
        language: str,
        alphabets: list[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: str | None,
        is_preferred: bool,
    ):
        """Store every argument unchanged on the attribute of the same name."""
        # Location of the analysed content.
        self.path: str = path
        self.unicode_path: str | None = unicode_path

        # Encoding verdict and its alternatives.
        self.encoding: str | None = encoding
        self.encoding_aliases: list[str] = encoding_aliases
        self.alternative_encodings: list[str] = alternative_encodings

        # Content characteristics.
        self.language: str = language
        self.alphabets: list[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom

        # Scores and ranking flag.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> dict[str, Any]:  # type: ignore
        """
        Mapping of the exported fields, in the fixed order used for the JSON output.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {field: getattr(self, field) for field in exported_fields}

    def to_json(self) -> str:
        """Serialise the exported fields as ASCII-only JSON indented by four spaces."""
        payload = self.__dict__
        return dumps(payload, ensure_ascii=True, indent=4)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/charset_normalizer/models.py: 35%

186 statements