Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/__init__.py: 38%

######################## BEGIN LICENSE BLOCK ########################
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, see
# <https://www.gnu.org/licenses/>.
######################### END LICENSE BLOCK #########################

from typing import Union

from .charsetgroupprober import CharSetGroupProber
from .charsetprober import CharSetProber
from .enums import EncodingEra, InputState
from .resultdict import ResultDict
from .universaldetector import UniversalDetector
from .version import VERSION, __version__

__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]


def detect(
    byte_str: Union[bytes, bytearray],
    should_rename_legacy: bool | None = None,
    encoding_era: EncodingEra = EncodingEra.MODERN_WEB,
    chunk_size: int = 65_536,
    max_bytes: int = 200_000,
) -> ResultDict:
    """
    Detect the encoding of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents?
                                 If None (default), automatically enabled
                                 when encoding_era is MODERN_WEB.
    :type should_rename_legacy: ``bool`` or ``None``
    :param encoding_era: Which era of encodings to consider during detection.
    :type encoding_era: ``EncodingEra``
    :param chunk_size: Size of chunks to process at a time.
    :type chunk_size: ``int``
    :param max_bytes: Maximum number of bytes to examine.
    :type max_bytes: ``int``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError(
                f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
            )
        byte_str = bytearray(byte_str)

    # Automatically enable legacy remapping for MODERN_WEB era if not explicitly set
    if should_rename_legacy is None:
        should_rename_legacy = encoding_era == EncodingEra.MODERN_WEB

    detector = UniversalDetector(
        should_rename_legacy=should_rename_legacy, encoding_era=encoding_era
    )

    # Process in chunks like uchardet does, examining at most max_bytes bytes
    limit = min(len(byte_str), max_bytes)
    for i in range(0, limit, chunk_size):
        chunk = byte_str[i : min(i + chunk_size, limit)]
        detector.feed(chunk)
        if detector.done:
            break

    return detector.close()

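# A minimal usage sketch for ``detect``, kept as a comment so the module body is
# unchanged; the sample text below is an illustrative assumption, not something
# this library prescribes:
#
#     from chardet import detect
#     from chardet.enums import EncodingEra
#
#     raw = "Grüße aus Köln".encode("windows-1252")
#     result = detect(raw, encoding_era=EncodingEra.MODERN_WEB)
#     # result is a ResultDict with "encoding", "confidence", and "language" keys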

def detect_all(
    byte_str: Union[bytes, bytearray],
    ignore_threshold: bool = False,
    should_rename_legacy: bool | None = None,
    encoding_era: EncodingEra = EncodingEra.MODERN_WEB,
    chunk_size: int = 65_536,
    max_bytes: int = 200_000,
) -> list[ResultDict]:
    """
    Detect all the possible encodings of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :param ignore_threshold: Include encodings that are below
                             ``UniversalDetector.MINIMUM_THRESHOLD``
                             in results.
    :type ignore_threshold: ``bool``
    :param should_rename_legacy: Should we rename legacy encodings
                                 to their more modern equivalents?
                                 If None (default), automatically enabled
                                 when encoding_era is MODERN_WEB.
    :type should_rename_legacy: ``bool`` or ``None``
    :param encoding_era: Which era of encodings to consider during detection.
    :type encoding_era: ``EncodingEra``
    :param chunk_size: Size of chunks to process at a time.
    :type chunk_size: ``int``
    :param max_bytes: Maximum number of bytes to examine.
    :type max_bytes: ``int``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError(
                f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
            )
        byte_str = bytearray(byte_str)

    # Automatically enable legacy remapping for MODERN_WEB era if not explicitly set
    if should_rename_legacy is None:
        should_rename_legacy = encoding_era == EncodingEra.MODERN_WEB

    detector = UniversalDetector(
        should_rename_legacy=should_rename_legacy, encoding_era=encoding_era
    )

    # Process in chunks like uchardet does, examining at most max_bytes bytes
    limit = min(len(byte_str), max_bytes)
    for i in range(0, limit, chunk_size):
        chunk = byte_str[i : min(i + chunk_size, limit)]
        detector.feed(chunk)
        if detector.done:
            break

    detector.close()

    if detector.input_state in (InputState.HIGH_BYTE, InputState.ESC_ASCII):
        results: list[ResultDict] = []
        probers: list[CharSetProber] = []
        for prober in detector.charset_probers:
            if isinstance(prober, CharSetGroupProber):
                probers.extend(p for p in prober.probers)
            else:
                probers.append(prober)
        for prober in probers:
            # Skip probers that determined this is NOT their encoding
            if not prober.active:
                continue
            if ignore_threshold or prober.get_confidence() > detector.MINIMUM_THRESHOLD:
                charset_name = prober.charset_name or ""
                lower_charset_name = charset_name.lower()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith("iso-8859") and detector.has_win_bytes:
                    charset_name = detector.ISO_WIN_MAP.get(
                        lower_charset_name, charset_name
                    )
                # Rename legacy encodings with superset encodings if asked
                if should_rename_legacy:
                    charset_name = detector.LEGACY_MAP.get(
                        charset_name.lower(), charset_name
                    )
                results.append({
                    "encoding": charset_name,
                    "confidence": prober.get_confidence(),
                    "language": prober.language,
                })
        if len(results) > 0:
            return sorted(results, key=lambda result: -result["confidence"])

    return [detector.result]
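# A minimal, hedged demonstration of ``detect_all``: the sample string and the
# keyword arguments below are illustrative assumptions, not values prescribed by
# this library.
if __name__ == "__main__":
    sample = "Grüße aus Köln".encode("windows-1252")

    # Candidates above UniversalDetector.MINIMUM_THRESHOLD, highest confidence
    # first; legacy renaming is auto-enabled for the default MODERN_WEB era.
    for candidate in detect_all(sample):
        print(candidate["encoding"], candidate["confidence"], candidate["language"])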