1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Communicator client code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 1998
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11#
12# This library is free software; you can redistribute it and/or
13# modify it under the terms of the GNU Lesser General Public
14# License as published by the Free Software Foundation; either
15# version 2.1 of the License, or (at your option) any later version.
16#
17# This library is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20# Lesser General Public License for more details.
21#
22# You should have received a copy of the GNU Lesser General Public
23# License along with this library; if not, see
24# <https://www.gnu.org/licenses/>.
25######################### END LICENSE BLOCK #########################
26
27from typing import Optional, Union
28
29from .charsetprober import CharSetProber
30from .enums import EncodingEra, LanguageFilter, ProbingState
31
32
class CharSetGroupProber(CharSetProber):
    """Prober that fans input out to a list of child probers.

    Bytes passed to :meth:`feed` are forwarded to every child in
    ``self.probers`` that is still marked active.  The group reports the
    charset and language of whichever child currently has the highest
    confidence (or of the first child that reports ``FOUND_IT``).
    """

    def __init__(
        self,
        *,
        lang_filter: LanguageFilter = LanguageFilter.ALL,
        encoding_era: EncodingEra = EncodingEra.ALL,
    ) -> None:
        """Create an empty group; ``self.probers`` starts out as an empty list.

        Both keyword arguments are passed straight through to the base
        class initializer.
        """
        super().__init__(lang_filter=lang_filter, encoding_era=encoding_era)
        # Number of children currently marked active (maintained by
        # reset() and feed()).
        self._active_num = 0
        # Child probers consulted by this group.
        self.probers: list[CharSetProber] = []
        # Child with the best confidence so far, or None if undetermined.
        self._best_guess_prober: Optional[CharSetProber] = None

    def reset(self) -> None:
        """Reset this prober and every child, re-activating all children."""
        super().reset()
        self._active_num = 0
        for child in self.probers:
            child.reset()
            child.active = True
            self._active_num += 1
        self._best_guess_prober = None

    @property
    def charset_name(self) -> Optional[str]:
        """Charset name of the best-guess child, or ``None`` if there is none.

        When no best guess is recorded yet, :meth:`get_confidence` is run
        first so that it can pick one.
        """
        if not self._best_guess_prober:
            self.get_confidence()
        winner = self._best_guess_prober
        return winner.charset_name if winner else None

    @property
    def language(self) -> Optional[str]:
        """Language of the best-guess child, or ``None`` if there is none.

        When no best guess is recorded yet, :meth:`get_confidence` is run
        first so that it can pick one.
        """
        if not self._best_guess_prober:
            self.get_confidence()
        winner = self._best_guess_prober
        return winner.language if winner else None

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Forward *byte_str* to each active child and return the group state.

        Feeding stops as soon as one child reports ``FOUND_IT`` (that child
        becomes the best guess) or once the last active child has reported
        ``NOT_ME`` (the whole group then becomes ``NOT_ME``).
        """
        for child in self.probers:
            if not child.active:
                continue
            outcome = child.feed(byte_str)
            if not outcome:
                continue
            if outcome == ProbingState.FOUND_IT:
                # A definite match: remember it and stop feeding the rest.
                self._best_guess_prober = child
                self._state = ProbingState.FOUND_IT
                break
            elif outcome == ProbingState.NOT_ME:
                # This child ruled itself out; retire it from the group.
                child.active = False
                self._active_num -= 1
                if self._active_num <= 0:
                    self._state = ProbingState.NOT_ME
                    break
        return self.state

    def get_confidence(self) -> float:
        """Return the group's confidence and refresh the best-guess child.

        Returns ``0.99`` when the group state is ``FOUND_IT`` and ``0.01``
        when it is ``NOT_ME``.  Otherwise every active child is polled, the
        one with the strictly highest confidence above ``0.0`` becomes the
        best guess, and its confidence is returned (``0.0`` if no child
        qualifies).
        """
        current = self.state
        if current == ProbingState.FOUND_IT:
            return 0.99
        if current == ProbingState.NOT_ME:
            return 0.01

        top_score = 0.0
        self._best_guess_prober = None
        for child in self.probers:
            if child.active:
                score = child.get_confidence()
                self.logger.debug(
                    "%s %s confidence = %s", child.charset_name, child.language, score
                )
                if score > top_score:
                    top_score = score
                    self._best_guess_prober = child
            else:
                self.logger.debug("%s not active", child.charset_name)

        return top_score if self._best_guess_prober else 0.0

    def _filter_probers(self, probers: list[CharSetProber]) -> list[CharSetProber]:
        """Return the probers allowed by this group's era and language filters.

        A prober is always kept (never filtered out) when any of these hold:

        * it has a non-None ``_logical_prober`` attribute -- a meta-prober
          such as HebrewProber that manages its own sub-probers;
        * it has a non-None ``_name_prober`` attribute -- a sub-prober that
          defers naming to a meta-prober (e.g. Hebrew logical/visual) and
          must stay alongside it;
        * its ``charset`` metadata is ``None``, so there is nothing to
          filter on.

        Every other prober is kept only if its charset's ``encoding_era``
        is contained in ``self.encoding_era`` and its ``language_filter``
        is contained in ``self.lang_filter``.  Input order is preserved.
        """

        def is_allowed(candidate: CharSetProber) -> bool:
            # Meta-probers and their dependent sub-probers bypass filtering.
            if getattr(candidate, "_logical_prober", None) is not None:
                return True
            if getattr(candidate, "_name_prober", None) is not None:
                return True

            info = candidate.charset
            if info is None:
                # No metadata to test against, so the prober is kept.
                return True

            # Era is checked before language, matching the original order.
            return (
                info.encoding_era in self.encoding_era
                and info.language_filter in self.lang_filter
            )

        return [candidate for candidate in probers if is_allowed(candidate)]