Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/charsetgroupprober.py: 92%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

89 statements  

1######################## BEGIN LICENSE BLOCK ######################## 

2# The Original Code is Mozilla Communicator client code. 

3# 

4# The Initial Developer of the Original Code is 

5# Netscape Communications Corporation. 

6# Portions created by the Initial Developer are Copyright (C) 1998 

7# the Initial Developer. All Rights Reserved. 

8# 

9# Contributor(s): 

10# Mark Pilgrim - port to Python 

11# 

12# This library is free software; you can redistribute it and/or 

13# modify it under the terms of the GNU Lesser General Public 

14# License as published by the Free Software Foundation; either 

15# version 2.1 of the License, or (at your option) any later version. 

16# 

17# This library is distributed in the hope that it will be useful, 

18# but WITHOUT ANY WARRANTY; without even the implied warranty of 

19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

20# Lesser General Public License for more details. 

21# 

22# You should have received a copy of the GNU Lesser General Public 

23# License along with this library; if not, see 

24# <https://www.gnu.org/licenses/>. 

25######################### END LICENSE BLOCK ######################### 

26 

27from typing import Optional, Union 

28 

29from .charsetprober import CharSetProber 

30from .enums import EncodingEra, LanguageFilter, ProbingState 

31 

32 

class CharSetGroupProber(CharSetProber):
    """Prober that fans its input out to a group of child probers.

    Bytes given to :meth:`feed` are forwarded to every active entry of
    ``self.probers``. The group's answer is the child that reports
    ``ProbingState.FOUND_IT`` or, failing that, the active child with the
    highest confidence.
    """

    def __init__(
        self,
        *,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.ALL,
    ) -> None:
        """Create a group with an empty ``probers`` list and no best guess."""
        super().__init__(lang_filter=lang_filter, encoding_era=encoding_era)
        self._active_num = 0
        self.probers: list[CharSetProber] = []
        self._best_guess_prober: Optional[CharSetProber] = None

    def reset(self) -> None:
        """Reset this prober and every child, marking all children active."""
        super().reset()
        self._active_num = 0
        for position, child in enumerate(self.probers, start=1):
            child.reset()
            child.active = True
            # Running count of children re-activated so far.
            self._active_num = position
        self._best_guess_prober = None

    def _ensure_best_guess(self) -> Optional[CharSetProber]:
        """Return the best-guess child, asking get_confidence() to pick one if unset."""
        if not self._best_guess_prober:
            # get_confidence() (re)selects ``_best_guess_prober`` as a side effect.
            self.get_confidence()
        return self._best_guess_prober

    @property
    def charset_name(self) -> Optional[str]:
        """Charset name of the best-guess child, or ``None`` if there is none."""
        winner = self._ensure_best_guess()
        return winner.charset_name if winner else None

    @property
    def language(self) -> Optional[str]:
        """Language of the best-guess child, or ``None`` if there is none."""
        winner = self._ensure_best_guess()
        return winner.language if winner else None

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Forward ``byte_str`` to each active child and update the group state.

        Stops at the first child that reports ``FOUND_IT`` (it becomes the
        best guess). A child that reports ``NOT_ME`` is deactivated; once no
        active children remain the group itself becomes ``NOT_ME``.

        Returns:
            The group's state after processing the input.
        """
        for child in self.probers:
            if not child.active:
                continue
            outcome = child.feed(byte_str)
            if not outcome:
                continue
            if outcome == ProbingState.FOUND_IT:
                self._best_guess_prober = child
                self._state = ProbingState.FOUND_IT
                break
            if outcome != ProbingState.NOT_ME:
                continue
            # The child ruled itself out: retire it from further feeding.
            child.active = False
            self._active_num -= 1
            if self._active_num <= 0:
                self._state = ProbingState.NOT_ME
                break
        return self.state

    def get_confidence(self) -> float:
        """Return the group's confidence and refresh the best-guess child.

        Returns 0.99 when the group state is ``FOUND_IT`` and 0.01 when it is
        ``NOT_ME``. Otherwise the active children are polled: the one with the
        highest confidence above zero becomes the best guess and its
        confidence is returned, or 0.0 if no child qualifies.
        """
        current = self.state
        if current == ProbingState.FOUND_IT:
            return 0.99
        if current == ProbingState.NOT_ME:
            return 0.01

        top_score = 0.0
        self._best_guess_prober = None
        for child in self.probers:
            if child.active:
                score = child.get_confidence()
                self.logger.debug(
                    "%s %s confidence = %s", child.charset_name, child.language, score
                )
                if top_score < score:
                    top_score = score
                    self._best_guess_prober = child
            else:
                self.logger.debug("%s not active", child.charset_name)
        return top_score if self._best_guess_prober else 0.0

    def _filter_probers(self, probers: list[CharSetProber]) -> list[CharSetProber]:
        """Filter probers based on encoding era and language."""

        def keep(candidate: CharSetProber) -> bool:
            # Meta-probers that manage sub-probers (e.g. HebrewProber) are
            # never filtered out; they select appropriate sub-probers themselves.
            if getattr(candidate, "_logical_prober", None) is not None:
                return True
            # Sub-probers that defer naming to a meta-prober (e.g. Hebrew
            # logical/visual) must stay alongside their parent.
            if getattr(candidate, "_name_prober", None) is not None:
                return True
            metadata = candidate.charset
            # Without charset metadata there is nothing to filter on.
            if metadata is None:
                return True
            # Both the encoding era and the language filter must match.
            return (
                metadata.encoding_era in self.encoding_era
                and metadata.language_filter in self.lang_filter
            )

        return [candidate for candidate in probers if keep(candidate)]