Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/__init__.py: 57%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

49 statements  

1"""Universal character encoding detector — 0BSD-licensed rewrite.""" 

2 

3from __future__ import annotations 

4 

5from collections.abc import Iterable 

6 

7from chardet._utils import ( 

8 _DEFAULT_CHUNK_SIZE, 

9 DEFAULT_MAX_BYTES, 

10 MINIMUM_THRESHOLD, 

11 _resolve_prefer_superset, 

12 _validate_max_bytes, 

13 _warn_deprecated_chunk_size, 

14) 

15from chardet._version import __version__ 

16from chardet.detector import UniversalDetector 

17from chardet.enums import EncodingEra, LanguageFilter 

18from chardet.equivalences import apply_compat_names, apply_preferred_superset 

19from chardet.pipeline import DetectionDict, DetectionResult 

20from chardet.pipeline.orchestrator import run_pipeline 

21from chardet.registry import _validate_encoding, normalize_encodings 

22 

# Names exported by ``from chardet import *`` — the package's public API.
# The entries (and their order) are part of the public contract; keep stable.
__all__ = [
    "DEFAULT_MAX_BYTES", "MINIMUM_THRESHOLD",
    "DetectionDict", "DetectionResult",
    "EncodingEra", "LanguageFilter",
    "UniversalDetector",
    "__version__",
    "detect", "detect_all",
]

35 

36 

def detect(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
    include_encodings: Iterable[str] | None = None,
    exclude_encodings: Iterable[str] | None = None,
    no_match_encoding: str = "cp1252",
    empty_input_encoding: str = "utf-8",
) -> DetectionDict:
    """Return the single most likely encoding for *byte_str*.

    :param byte_str: The byte sequence to detect encoding for.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents (e.g., ISO-8859-1 -> Windows-1252).
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x. If ``False``, return raw Python
        codec names.
    :param include_encodings: If given, restrict detection to only these
        encodings (names or aliases).
    :param exclude_encodings: If given, remove these encodings from the
        candidate set.
    :param no_match_encoding: Encoding to return when no candidate survives
        the pipeline. Defaults to ``"cp1252"``.
    :param empty_input_encoding: Encoding to return for empty input. Defaults
        to ``"utf-8"``.
    :returns: A dictionary with keys ``"encoding"``, ``"confidence"``, and
        ``"language"``.
    """
    # Deprecation warnings / argument validation happen up front so callers
    # get errors before any detection work starts.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    prefer_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline operates on immutable bytes; bytearray input is copied.
    payload = byte_str if isinstance(byte_str, bytes) else bytes(byte_str)
    candidates = run_pipeline(
        payload,
        encoding_era,
        max_bytes=max_bytes,
        include_encodings=normalize_encodings(include_encodings, "include_encodings"),
        exclude_encodings=normalize_encodings(exclude_encodings, "exclude_encodings"),
        no_match_encoding=_validate_encoding(no_match_encoding, "no_match_encoding"),
        empty_input_encoding=_validate_encoding(
            empty_input_encoding, "empty_input_encoding"
        ),
    )

    # The pipeline's first result is the best candidate; optionally remap
    # its name for superset preference and legacy-compatible spelling.
    best = candidates[0].to_dict()
    if prefer_superset:
        apply_preferred_superset(best)
    if compat_names:
        apply_compat_names(best)
    return best

98 

99 

def detect_all(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    ignore_threshold: bool = False,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
    include_encodings: Iterable[str] | None = None,
    exclude_encodings: Iterable[str] | None = None,
    no_match_encoding: str = "cp1252",
    empty_input_encoding: str = "utf-8",
) -> list[DetectionDict]:
    """Return every plausible encoding for *byte_str*, best first.

    When *ignore_threshold* is False (the default), results with confidence
    <= MINIMUM_THRESHOLD (0.20) are filtered out. If all results are below
    the threshold, the full unfiltered list is returned as a fallback so the
    caller always receives at least one result.

    :param byte_str: The byte sequence to detect encoding for.
    :param ignore_threshold: If ``True``, return all candidate encodings
        regardless of confidence score.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Restrict candidate encodings to the given era.
    :param chunk_size: Deprecated -- accepted for backward compatibility but
        has no effect.
    :param max_bytes: Maximum number of bytes to examine from *byte_str*.
    :param prefer_superset: If ``True``, remap ISO subset encodings to their
        Windows/CP superset equivalents.
    :param compat_names: If ``True`` (default), return encoding names
        compatible with chardet 5.x/6.x. If ``False``, return raw Python
        codec names.
    :param include_encodings: If given, restrict detection to only these
        encodings (names or aliases).
    :param exclude_encodings: If given, remove these encodings from the
        candidate set.
    :param no_match_encoding: Encoding to return when no candidate survives
        the pipeline. Defaults to ``"cp1252"``.
    :param empty_input_encoding: Encoding to return for empty input. Defaults
        to ``"utf-8"``.
    :returns: A list of dictionaries, sorted by descending confidence.
    """
    # Validate and normalize arguments before running any detection.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    prefer_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline operates on immutable bytes; bytearray input is copied.
    payload = byte_str if isinstance(byte_str, bytes) else bytes(byte_str)
    candidates = run_pipeline(
        payload,
        encoding_era,
        max_bytes=max_bytes,
        include_encodings=normalize_encodings(include_encodings, "include_encodings"),
        exclude_encodings=normalize_encodings(exclude_encodings, "exclude_encodings"),
        no_match_encoding=_validate_encoding(no_match_encoding, "no_match_encoding"),
        empty_input_encoding=_validate_encoding(
            empty_input_encoding, "empty_input_encoding"
        ),
    )
    entries = [candidate.to_dict() for candidate in candidates]

    # Drop low-confidence results unless asked not to; if *everything* is
    # low-confidence, fall back to the unfiltered list so the caller always
    # gets at least one result.
    if not ignore_threshold:
        confident = [e for e in entries if e["confidence"] > MINIMUM_THRESHOLD]
        entries = confident or entries

    # Apply name remapping to each surviving result in place.
    for entry in entries:
        if prefer_superset:
            apply_preferred_superset(entry)
        if compat_names:
            apply_compat_names(entry)

    entries.sort(key=lambda entry: entry["confidence"], reverse=True)
    return entries