Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/__init__.py

1"""Universal character encoding detector — MIT-licensed rewrite."""

3from __future__ import annotations

5from chardet._utils import (

6 _DEFAULT_CHUNK_SIZE,

7 DEFAULT_MAX_BYTES,

8 MINIMUM_THRESHOLD,

9 _resolve_prefer_superset,

10 _validate_max_bytes,

11 _warn_deprecated_chunk_size,

12)

13from chardet._version import __version__

14from chardet.detector import UniversalDetector

15from chardet.enums import EncodingEra, LanguageFilter

16from chardet.equivalences import apply_compat_names, apply_preferred_superset

17from chardet.pipeline import DetectionDict, DetectionResult

18from chardet.pipeline.orchestrator import run_pipeline

# Names exported by ``from chardet import *``.
__all__ = [
    "DEFAULT_MAX_BYTES",
    "MINIMUM_THRESHOLD",
    "DetectionDict",
    "DetectionResult",
    "EncodingEra",
    "LanguageFilter",
    "UniversalDetector",
    "__version__",
    "detect",
    "detect_all",
]

def detect(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
) -> DetectionDict:
    """Return the top-ranked encoding guess for *byte_str*.

    The input is run through the detection pipeline and the first result
    is converted to a dictionary, optionally remapped, and returned.

    :param byte_str: Bytes (or bytearray) whose encoding should be detected.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Limits the candidate encodings to the given era.
    :param chunk_size: Deprecated -- kept only for backward compatibility;
        it has no effect on detection.
    :param max_bytes: Upper bound on how many bytes of *byte_str* are
        examined.
    :param prefer_superset: When ``True``, ISO subset encodings are remapped
        to their Windows/CP superset equivalents (e.g., ISO-8859-1 ->
        Windows-1252).
    :param compat_names: When ``True`` (default), encoding names compatible
        with chardet 5.x/6.x are returned; when ``False``, raw Python codec
        names are returned.
    :returns: A dictionary with keys ``"encoding"``, ``"confidence"``, and
        ``"language"``.
    """
    # Argument checks run first, in this order, before any detection work.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    use_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline is handed a ``bytes`` object; anything else is copied.
    if isinstance(byte_str, bytes):
        payload = byte_str
    else:
        payload = bytes(byte_str)

    candidates = run_pipeline(payload, encoding_era, max_bytes=max_bytes)
    best = candidates[0].to_dict()

    # Superset remapping is applied before compatibility renaming.
    if use_superset:
        apply_preferred_superset(best)
    if compat_names:
        apply_compat_names(best)
    return best

def detect_all(  # noqa: PLR0913
    byte_str: bytes | bytearray,
    ignore_threshold: bool = False,
    should_rename_legacy: bool = False,
    encoding_era: EncodingEra = EncodingEra.ALL,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    max_bytes: int = DEFAULT_MAX_BYTES,
    *,
    prefer_superset: bool = False,
    compat_names: bool = True,
) -> list[DetectionDict]:
    """Return every candidate encoding for *byte_str*, best first.

    Unless *ignore_threshold* is set, candidates whose confidence is
    <= MINIMUM_THRESHOLD (0.20) are dropped. Should that leave nothing,
    the complete unfiltered list is used instead, so the caller always
    gets at least one result.

    :param byte_str: Bytes (or bytearray) whose encoding should be detected.
    :param ignore_threshold: When ``True``, all candidates are returned
        regardless of their confidence score.
    :param should_rename_legacy: Deprecated alias for *prefer_superset*.
    :param encoding_era: Limits the candidate encodings to the given era.
    :param chunk_size: Deprecated -- kept only for backward compatibility;
        it has no effect on detection.
    :param max_bytes: Upper bound on how many bytes of *byte_str* are
        examined.
    :param prefer_superset: When ``True``, ISO subset encodings are remapped
        to their Windows/CP superset equivalents.
    :param compat_names: When ``True`` (default), encoding names compatible
        with chardet 5.x/6.x are returned; when ``False``, raw Python codec
        names are returned.
    :returns: A list of dictionaries, sorted by descending confidence.
    """
    # Argument checks run first, in this order, before any detection work.
    _warn_deprecated_chunk_size(chunk_size)
    _validate_max_bytes(max_bytes)
    use_superset = _resolve_prefer_superset(should_rename_legacy, prefer_superset)

    # The pipeline is handed a ``bytes`` object; anything else is copied.
    if isinstance(byte_str, bytes):
        payload = byte_str
    else:
        payload = bytes(byte_str)

    candidates = run_pipeline(payload, encoding_era, max_bytes=max_bytes)
    entries = [candidate.to_dict() for candidate in candidates]

    if not ignore_threshold:
        confident = [
            entry for entry in entries if entry["confidence"] > MINIMUM_THRESHOLD
        ]
        # An empty filter result falls back to the full candidate list.
        entries = confident or entries

    # Per entry: superset remapping first, then compatibility renaming.
    for entry in entries:
        if use_superset:
            apply_preferred_superset(entry)
        if compat_names:
            apply_compat_names(entry)

    return sorted(entries, key=lambda entry: entry["confidence"], reverse=True)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/__init__.py: 56%

39 statements