Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset

1from encodings.aliases import aliases

2from hashlib import sha256

3from json import dumps

4from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

6from .constant import TOO_BIG_SEQUENCE

7from .utils import iana_name, is_multi_byte_encoding, unicode_range

class CharsetMatch:
    """
    One candidate decoding of a byte payload.

    Holds the raw bytes, the guessed encoding, the mess ("chaos") ratio, the
    detected languages with their coherence ratio, and the list of
    sub-matches registered through add_submatch().
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Name of the encoding used to decode the payload.
        :param mean_mess_ratio: Mess ratio, exposed through the 'chaos' property.
        :param has_sig_or_bom: Exposed through 'bom' / 'byte_order_mark'.
        :param languages: Sequence of (language, coherence ratio) pairs; the first
            entry is the one reported by 'language' and 'coherence'.
        :param decoded_payload: Already decoded text, if available. When None the
            payload is decoded lazily on the first str() call.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Compare with another CharsetMatch (same encoding and same fingerprint)
        or with a str (its iana_name() result must equal this encoding).
        Any other type compares unequal.
        """
        if not isinstance(other, CharsetMatch):
            if isinstance(other, str):
                return iana_name(other) == self.encoding
            return False
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.

        :raises ValueError: when other is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence
        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
            # preserve RAM usage!
            if len(self._payload) >= TOO_BIG_SEQUENCE:
                return self.chaos < other.chaos
            return self.multi_byte_usage > other.multi_byte_usage

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio between the decoded character count and the raw
        byte count. Returns 0.0 for an empty payload.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            # Nothing was decoded; avoid dividing by zero.
            return 0.0
        return 1.0 - (len(str(self)) / raw_length)

    def __str__(self) -> str:
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Register another match as a leaf of this one.

        :raises ValueError: when other is not a CharsetMatch or compares equal
            to this instance.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """Coherence ratio of the first language entry, 0.0 when there is none."""
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated list of the non-empty unicode_range() results for
        every character of the decoded text. Computed once, then cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters that cannot be encoded are substituted by the encoder
        ("replace" error handler); they are NOT dropped and no error is raised.
        The result is cached for the last requested encoding.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()

221

222

class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        # The given results are sorted once on construction.
        self._results: List["CharsetMatch"] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).

        :raises IndexError: when an integer position is out of range.
        :raises KeyError: when the encoding is not present in results, or when
            item is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        # Carry the looked-up key so the error is actionable.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return bool(self._results)

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.

        :raises ValueError: when item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            for match in self._results:
                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Return the first match, or None when the container is empty
        (unlike matches[0], which raises IndexError in that case).
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

288

289

# A (language, coherence ratio) pair, and a list of such pairs.
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]

292

293

class CliDetectionResult:
    """
    Plain record of one detection result, serializable to JSON via to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """Store every argument unchanged on the attribute of the same name."""
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """Mapping of every field, in the fixed key order used by to_json()."""
        return {
            "path": self.path,
            "encoding": self.encoding,
            "encoding_aliases": self.encoding_aliases,
            "alternative_encodings": self.alternative_encodings,
            "language": self.language,
            "alphabets": self.alphabets,
            "has_sig_or_bom": self.has_sig_or_bom,
            "chaos": self.chaos,
            "coherence": self.coherence,
            "unicode_path": self.unicode_path,
            "is_preferred": self.is_preferred,
        }

    def to_json(self) -> str:
        """Return the field mapping as ASCII-only JSON indented by 4 spaces."""
        return dumps(self.__dict__, ensure_ascii=True, indent=4)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/models.py: 69%

178 statements