Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset

1from encodings.aliases import aliases

2from hashlib import sha256

3from json import dumps

4from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

6from .constant import TOO_BIG_SEQUENCE

7from .utils import iana_name, is_multi_byte_encoding, unicode_range

class CharsetMatch:
    """
    One candidate interpretation of a bytes payload: the payload itself, the encoding
    it is assumed to use, and the scores (mess ratio, language coherence) attached to
    that guess. Other encodings producing the same result can be attached as submatches.
    """

    def __init__(
        self,
        payload: bytes,
        guessed_encoding: str,
        mean_mess_ratio: float,
        has_sig_or_bom: bool,
        languages: "CoherenceMatches",
        decoded_payload: Optional[str] = None,
    ):
        """
        :param payload: Original, untouched bytes.
        :param guessed_encoding: Name of the encoding this match stands for.
        :param mean_mess_ratio: Value exposed through the ``chaos`` property.
        :param has_sig_or_bom: Value exposed through ``bom`` / ``byte_order_mark``.
        :param languages: Sequence of (language, ratio) pairs; index 0 drives ``language`` and ``coherence``.
        :param decoded_payload: Already decoded text, if available. When None the payload is decoded lazily.
        """
        self._payload: bytes = payload

        self._encoding: str = guessed_encoding
        self._mean_mess_ratio: float = mean_mess_ratio
        self._languages: CoherenceMatches = languages
        self._has_sig_or_bom: bool = has_sig_or_bom
        # Cache for the 'alphabets' property.
        self._unicode_ranges: Optional[List[str]] = None

        self._leaves: List[CharsetMatch] = []
        self._mean_coherence_ratio: float = 0.0

        # Cache for output(): last re-encoded payload and the encoding used to produce it.
        self._output_payload: Optional[bytes] = None
        self._output_encoding: Optional[str] = None

        self._string: Optional[str] = decoded_payload

    def __eq__(self, other: object) -> bool:
        """
        Two matches are equal when they share both the encoding and the fingerprint.
        Raise TypeError when compared with anything that is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise TypeError(
                "__eq__ cannot be invoked on {} and {}.".format(
                    str(other.__class__), str(self.__class__)
                )
            )
        return self.encoding == other.encoding and self.fingerprint == other.fingerprint

    def __lt__(self, other: object) -> bool:
        """
        Implemented to make sorted available upon CharsetMatches items.
        "Lower" means "more probable". Raise ValueError if other is not a CharsetMatch.
        """
        if not isinstance(other, CharsetMatch):
            raise ValueError

        chaos_difference: float = abs(self.chaos - other.chaos)
        coherence_difference: float = abs(self.coherence - other.coherence)

        # When having a tough decision (exact tie on both scores), use the result that
        # decoded as many multi-byte as possible. This check has to come first: nested
        # under "coherence_difference > 0.02" it could never be true.
        if chaos_difference == 0.0 and self.coherence == other.coherence:
            return self.multi_byte_usage > other.multi_byte_usage

        # Below 1% difference --> Use Coherence
        if chaos_difference < 0.01 and coherence_difference > 0.02:
            return self.coherence > other.coherence

        return self.chaos < other.chaos

    @property
    def multi_byte_usage(self) -> float:
        """
        1.0 minus the ratio of decoded characters to raw bytes.
        Return 0.0 for an empty payload instead of dividing by zero.
        """
        raw_length: int = len(self.raw)
        if raw_length == 0:
            return 0.0
        return 1.0 - len(str(self)) / raw_length

    def __str__(self) -> str:
        """
        Decoded payload. Decoding is strict, done on first access and then cached.
        """
        # Lazy Str Loading
        if self._string is None:
            self._string = str(self._payload, self._encoding, "strict")
        return self._string

    def __repr__(self) -> str:
        return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)

    def add_submatch(self, other: "CharsetMatch") -> None:
        """
        Attach another match as a leaf of this one.
        Raise ValueError if other is not a CharsetMatch or compares equal to self.
        """
        if not isinstance(other, CharsetMatch) or other == self:
            raise ValueError(
                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
                    other.__class__
                )
            )

        other._string = None  # Unload RAM usage; dirty trick.
        self._leaves.append(other)

    @property
    def encoding(self) -> str:
        return self._encoding

    @property
    def encoding_aliases(self) -> List[str]:
        """
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        """
        also_known_as: List[str] = []
        for u, p in aliases.items():
            if self.encoding == u:
                also_known_as.append(p)
            elif self.encoding == p:
                also_known_as.append(u)
        return also_known_as

    @property
    def bom(self) -> bool:
        return self._has_sig_or_bom

    @property
    def byte_order_mark(self) -> bool:
        """
        Same value as the 'bom' property.
        """
        return self._has_sig_or_bom

    @property
    def languages(self) -> List[str]:
        """
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        """
        return [e[0] for e in self._languages]

    @property
    def language(self) -> str:
        """
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        """
        if not self._languages:
            # Trying to infer the language based on the given encoding
            # Its either English or we should not pronounce ourselves in certain cases.
            if "ascii" in self.could_be_from_charset:
                return "English"

            # doing it there to avoid circular import
            from charset_normalizer.cd import encoding_languages, mb_encoding_languages

            languages = (
                mb_encoding_languages(self.encoding)
                if is_multi_byte_encoding(self.encoding)
                else encoding_languages(self.encoding)
            )

            if len(languages) == 0 or "Latin Based" in languages:
                return "Unknown"

            return languages[0]

        return self._languages[0][0]

    @property
    def chaos(self) -> float:
        return self._mean_mess_ratio

    @property
    def coherence(self) -> float:
        """
        Ratio attached to the first language entry, or 0.0 when no language is known.
        """
        if not self._languages:
            return 0.0
        return self._languages[0][1]

    @property
    def percent_chaos(self) -> float:
        return round(self.chaos * 100, ndigits=3)

    @property
    def percent_coherence(self) -> float:
        return round(self.coherence * 100, ndigits=3)

    @property
    def raw(self) -> bytes:
        """
        Original untouched bytes.
        """
        return self._payload

    @property
    def submatch(self) -> List["CharsetMatch"]:
        return self._leaves

    @property
    def has_submatch(self) -> bool:
        return len(self._leaves) > 0

    @property
    def alphabets(self) -> List[str]:
        """
        Sorted, de-duplicated list of the unicode ranges met in the decoded payload.
        Computed on first access and then cached.
        """
        if self._unicode_ranges is not None:
            return self._unicode_ranges
        # list detected ranges
        detected_ranges: List[Optional[str]] = [
            unicode_range(char) for char in str(self)
        ]
        # filter and sort
        self._unicode_ranges = sorted({r for r in detected_ranges if r})
        return self._unicode_ranges

    @property
    def could_be_from_charset(self) -> List[str]:
        """
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        """
        return [self._encoding] + [m.encoding for m in self._leaves]

    def output(self, encoding: str = "utf_8") -> bytes:
        """
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Characters the target encoding cannot represent are REPLACED by the encoder
        ("replace" error handler), they are neither dropped nor raise an error.
        The result is cached until another target encoding is requested.
        """
        if self._output_encoding is None or self._output_encoding != encoding:
            self._output_encoding = encoding
            self._output_payload = str(self).encode(encoding, "replace")

        return self._output_payload  # type: ignore

    @property
    def fingerprint(self) -> str:
        """
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        """
        return sha256(self.output()).hexdigest()

220

221

class CharsetMatches:
    """
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    """

    def __init__(self, results: Optional[List["CharsetMatch"]] = None):
        """
        :param results: Initial matches. They are sorted on the way in; None or an empty list gives an empty container.
        """
        self._results: List[CharsetMatch] = sorted(results) if results else []

    def __iter__(self) -> Iterator["CharsetMatch"]:
        yield from self._results

    def __getitem__(self, item: Union[int, str]) -> "CharsetMatch":
        """
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise IndexError upon invalid position.
        Raise KeyError upon encoding not present in results, or when item is neither an int nor a str.
        """
        if isinstance(item, int):
            return self._results[item]
        if isinstance(item, str):
            item = iana_name(item, False)
            for result in self._results:
                if item in result.could_be_from_charset:
                    return result
        # Carry the looked-up key so the error is actionable for the caller.
        raise KeyError(item)

    def __len__(self) -> int:
        return len(self._results)

    def __bool__(self) -> bool:
        return len(self._results) > 0

    def append(self, item: "CharsetMatch") -> None:
        """
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        Raise ValueError if item is not a CharsetMatch.
        """
        if not isinstance(item, CharsetMatch):
            raise ValueError(
                "Cannot append instance '{}' to CharsetMatches".format(
                    str(item.__class__)
                )
            )
        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
        if len(item.raw) <= TOO_BIG_SEQUENCE:
            # Each access to 'fingerprint' hashes the whole re-encoded payload, so the
            # item fingerprint is computed at most once and only when a candidate
            # already passed the cheap chaos comparison.
            item_fingerprint: Optional[str] = None
            for match in self._results:
                if match.chaos != item.chaos:
                    continue
                if item_fingerprint is None:
                    item_fingerprint = item.fingerprint
                if match.fingerprint == item_fingerprint:
                    match.add_submatch(item)
                    return
        self._results.append(item)
        self._results = sorted(self._results)

    def best(self) -> Optional["CharsetMatch"]:
        """
        Simply return the first match, or None when the container is empty.
        """
        if not self._results:
            return None
        return self._results[0]

    def first(self) -> Optional["CharsetMatch"]:
        """
        Redundant method, call the method best(). Kept for BC reasons.
        """
        return self.best()

287

288

# A language name paired with its coherence ratio, e.g. ("English", 0.9).
CoherenceMatch = Tuple[str, float]
# List of such pairs; CharsetMatch reads index 0 as the most probable language.
CoherenceMatches = List[CoherenceMatch]

291

292

class CliDetectionResult:
    """
    Plain record holding one detection result, serializable to JSON through to_json().
    """

    def __init__(
        self,
        path: str,
        encoding: Optional[str],
        encoding_aliases: List[str],
        alternative_encodings: List[str],
        language: str,
        alphabets: List[str],
        has_sig_or_bom: bool,
        chaos: float,
        coherence: float,
        unicode_path: Optional[str],
        is_preferred: bool,
    ):
        """
        Store every argument, unchanged, on the attribute of the same name.
        """
        # Location related fields.
        self.path: str = path
        self.unicode_path: Optional[str] = unicode_path

        # Encoding related fields.
        self.encoding: Optional[str] = encoding
        self.encoding_aliases: List[str] = encoding_aliases
        self.alternative_encodings: List[str] = alternative_encodings

        # Content related fields.
        self.language: str = language
        self.alphabets: List[str] = alphabets
        self.has_sig_or_bom: bool = has_sig_or_bom

        # Scores and ranking.
        self.chaos: float = chaos
        self.coherence: float = coherence
        self.is_preferred: bool = is_preferred

    @property
    def __dict__(self) -> Dict[str, Any]:  # type: ignore
        """
        Mapping of the stored fields, in the key order used for the JSON output.
        """
        exported_fields = (
            "path",
            "encoding",
            "encoding_aliases",
            "alternative_encodings",
            "language",
            "alphabets",
            "has_sig_or_bom",
            "chaos",
            "coherence",
            "unicode_path",
            "is_preferred",
        )
        return {name: getattr(self, name) for name in exported_fields}

    def to_json(self) -> str:
        """
        Serialize the record as ASCII-only JSON indented with 4 spaces.
        """
        as_mapping = self.__dict__
        return dumps(as_mapping, ensure_ascii=True, indent=4)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/charset_normalizer/models.py: 37%

174 statements