Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/charset_normalizer/models.py

1from encodings.aliases import aliases

2from hashlib import sha256

3from json import dumps

4from re import sub

5from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

7from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE

8from .utils import iana_name, is_multi_byte_encoding, unicode_range

class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    Holds the original bytes, the guessed encoding, the mess ("chaos") ratio and the
    language coherence results. The decoded text, the re-encoded output and the
    detected unicode ranges are computed lazily and cached on the instance.

    Note: the class defines __eq__ without __hash__, so instances are not hashable.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
        preemptive_declaration: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Encoding name used to decode the payload.
        :param mean_mess_ratio: Value exposed through the 'chaos' property.
        :param has_sig_or_bom: Value exposed through 'bom' / 'byte_order_mark'.
        :param languages: Sequence of (language, coherence ratio) pairs; the first
            entry is the one reported by 'language' and 'coherence'.
        :param decoded_payload: Already decoded text, if available. When None the
            text is decoded on first use.
        :param preemptive_declaration: Encoding declaration associated with the
            payload, if any. Only consulted by output().
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last produced payload and the encoding it was made for.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

        self._preemptive_declaration: Optional[str] = preemptive_declaration

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when both their encoding and their fingerprint agree.
        A str is compared, after going through iana_name(), to this match's encoding.
        Anything else is never equal.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        A match that is "less than" another one sorts first, i.e. is more probable.

        :raises ValueError: when other is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 - (number of decoded characters / number of raw bytes).
        An empty payload yields 0.0.
        """
        raw_length = len(self.raw)
        if raw_length == 0:
            # Nothing was decoded; dividing would raise ZeroDivisionError.
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        """
        Decoded payload. Decoding is strict and happens once, on first use.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register another match as a leaf of this one. Its cached decoded text is dropped.

        :raises ValueError: when other is not a CharsetMatch or compares equal to self.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        # 'aliases' maps an alias to its codec name; collect whichever side is
        # not the current encoding.
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if not languages or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Coherence ratio of the first language entry, 0.0 when there is none.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated unicode range names found in the decoded text.
        Computed once and cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are substituted by the encoder
        ("replace" error handler); they are not silently dropped.

        When a preemptive declaration other than UTF-8 is set, the first match of
        RE_POSSIBLE_ENCODING_INDICATION within the first 8192 characters is patched so
        that the captured name becomes iana_name(encoding).

        The result is cached for the last requested encoding. The cache is only
        updated once encoding succeeded, so a failing call never poisons it.
        """
        if self._output_payload is not None and self._output_encoding == encoding:
            return self._output_payload

        decoded_string = str(self)
        if (
            self._preemptive_declaration is not None
            and self._preemptive_declaration.lower()
            not in ["utf-8", "utf8", "utf_8"]
        ):
            patched_header = sub(
                RE_POSSIBLE_ENCODING_INDICATION,
                # iana_name() is resolved lazily, only if a declaration is found.
                lambda m: m.string[m.span()[0] : m.span()[1]].replace(
                    m.groups()[0], iana_name(encoding)
                ),
                decoded_string[:8192],
                1,
            )

            decoded_string = patched_header + decoded_string[8192:]

        payload = decoded_string.encode(encoding, "replace")

        # Commit both cache fields together, after the encode succeeded.
        self._output_payload = payload
        self._output_encoding = encoding

        return payload

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()

242

243

class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches; they are sorted on construction. None or an
            empty list gives an empty container.
        """
        self._results: List["CharsetMatch"] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        :raises IndexError: when an integer position is out of range.
        :raises KeyError: when the encoding is not present in results, or when item is
            neither an int nor a str. The exception carries the key that was looked up.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: when item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) < TOO_BIG_SEQUENCE:
            for match in self._results:
                # Same re-encoded output and same chaos: record it as an
                # alternative encoding of the existing match instead.
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

309

310

# One language guess: (language name, coherence ratio).
CoherenceMatch = Tuple[str, float]

# Sequence of language guesses as stored on a CharsetMatch; the first entry is the
# one CharsetMatch reports through its 'language' and 'coherence' properties.
CoherenceMatches = List[Tuple[str, float]]

313

314

class CliDetectionResult:
    """
    Plain record describing one detection outcome, serializable to JSON.
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """
        Store every argument, unchanged, on the attribute of the same name.
        """
        self.path: str = path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.unicode_path: Optional[str] = unicode_path
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Mapping of the exported fields, in the order used for the JSON output.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in exported_fields}

    def to_json(self) -> str:
        """
        Serialize the exported fields as ASCII-only JSON indented by 4 spaces.
        """
        exported = self.__dict__
        return dumps(exported, indent=4, ensure_ascii=True)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/charset_normalizer/models.py: 35%

184 statements