Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/charsetgroupprober.py: 92%

1######################## BEGIN LICENSE BLOCK ########################

2# The Original Code is Mozilla Communicator client code.

4# The Initial Developer of the Original Code is

5# Netscape Communications Corporation.

9# Contributor(s):

10# Mark Pilgrim - port to Python

11#

12# This library is free software; you can redistribute it and/or

13# modify it under the terms of the GNU Lesser General Public

14# License as published by the Free Software Foundation; either

15# version 2.1 of the License, or (at your option) any later version.

16#

17# This library is distributed in the hope that it will be useful,

18# but WITHOUT ANY WARRANTY; without even the implied warranty of

19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

20# Lesser General Public License for more details.

21#

22# You should have received a copy of the GNU Lesser General Public

23# License along with this library; if not, see

24# <https://www.gnu.org/licenses/>.

25######################### END LICENSE BLOCK #########################

27from typing import Optional, Union

29from .charsetprober import CharSetProber

30from .enums import EncodingEra, LanguageFilter, ProbingState

class CharSetGroupProber(CharSetProber):
    """Prober that delegates to a list of child probers.

    Input is forwarded to every child that is still active; the group
    reports the name, language and confidence of whichever child is
    currently the best guess.
    """

    def __init__(
        self,
        *,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.ALL,
    ) -> None:
        """Create an empty group; the ``probers`` list starts out empty.

        Both keyword arguments are passed straight to the base class.
        """
        super().__init__(lang_filter=lang_filter, encoding_era=encoding_era)
        # Number of children that have not been deactivated by ``feed``.
        self._active_num = 0
        self.probers: list[CharSetProber] = []
        # Child currently considered the winner, or None when undecided.
        self._best_guess_prober: Optional[CharSetProber] = None

    def reset(self) -> None:
        """Reset the group and every child, marking all children active."""
        super().reset()
        self._active_num = 0
        for position, child in enumerate(self.probers, start=1):
            child.reset()
            child.active = True
            # Keep the running count in step with the children reset so far.
            self._active_num = position
        self._best_guess_prober = None

    def _current_best_guess(self) -> Optional[CharSetProber]:
        """Return the best-guess child, ranking the children first if needed."""
        if not self._best_guess_prober:
            # get_confidence() selects the best guess as a side effect.
            self.get_confidence()
        return self._best_guess_prober

    @property
    def charset_name(self) -> Optional[str]:
        """Charset name of the best-guess child, or None without a guess."""
        winner = self._current_best_guess()
        return winner.charset_name if winner else None

    @property
    def language(self) -> Optional[str]:
        """Language of the best-guess child, or None without a guess."""
        winner = self._current_best_guess()
        return winner.language if winner else None

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Pass ``byte_str`` to each active child and update the group state.

        The first child answering FOUND_IT becomes the best guess and ends
        the call. A child answering NOT_ME is deactivated; once no active
        children remain the group itself becomes NOT_ME.

        Returns:
            The group's state after processing the input.
        """
        for child in self.probers:
            if not child.active:
                continue
            outcome = child.feed(byte_str)
            if not outcome:
                # Falsy result: nothing to act on for this child.
                continue
            if outcome == ProbingState.FOUND_IT:
                self._best_guess_prober = child
                self._state = ProbingState.FOUND_IT
                return self.state
            if outcome == ProbingState.NOT_ME:
                child.active = False
                self._active_num -= 1
                if self._active_num > 0:
                    continue
                # Every child has ruled itself out.
                self._state = ProbingState.NOT_ME
                return self.state
        return self.state

    def get_confidence(self) -> float:
        """Return the group's confidence and refresh the best-guess child.

        Returns:
            0.99 when the group state is FOUND_IT and 0.01 when it is
            NOT_ME. Otherwise the highest confidence reported by an active
            child, or 0.0 when no child qualifies as a best guess.
        """
        current = self.state
        if current == ProbingState.FOUND_IT:
            return 0.99
        if current == ProbingState.NOT_ME:
            return 0.01

        top_score = 0.0
        self._best_guess_prober = None
        for child in self.probers:
            if child.active:
                score = child.get_confidence()
                self.logger.debug(
                    "%s %s confidence = %s", child.charset_name, child.language, score
                )
                # Strict comparison: on a tie the earlier child stays ahead.
                if top_score < score:
                    top_score = score
                    self._best_guess_prober = child
            else:
                self.logger.debug("%s not active", child.charset_name)

        return top_score if self._best_guess_prober else 0.0

    def _passes_filters(self, prober: CharSetProber) -> bool:
        """Tell whether ``prober`` survives era and language filtering.

        A prober is always kept when it has a non-None ``_logical_prober``
        or ``_name_prober`` attribute (per the original notes, the Hebrew
        meta-prober and the sub-probers that defer naming to it), or when
        it has no charset metadata. Any other prober must match both the
        group's encoding era and its language filter.
        """
        if (
            getattr(prober, "_logical_prober", None) is not None
            or getattr(prober, "_name_prober", None) is not None
        ):
            return True

        metadata = prober.charset
        if metadata is None:
            return True

        return (
            metadata.encoding_era in self.encoding_era
            and metadata.language_filter in self.lang_filter
        )

    def _filter_probers(self, probers: list[CharSetProber]) -> list[CharSetProber]:
        """Filter probers based on encoding era and language.

        Returns:
            A new list with the accepted probers in their original order.
        """
        return [prober for prober in probers if self._passes_filters(prober)]