1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Communicator client code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 1998
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11#
12# This library is free software; you can redistribute it and/or
13# modify it under the terms of the GNU Lesser General Public
14# License as published by the Free Software Foundation; either
15# version 2.1 of the License, or (at your option) any later version.
16#
17# This library is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20# Lesser General Public License for more details.
21#
22# You should have received a copy of the GNU Lesser General Public
23# License along with this library; if not, see
24# <https://www.gnu.org/licenses/>.
25######################### END LICENSE BLOCK #########################
26
27from typing import List, Optional, Union
28
29from .charsetprober import CharSetProber
30from .enums import LanguageFilter, ProbingState
31
32
class CharSetGroupProber(CharSetProber):
    """Composite prober that forwards input to a list of child probers.

    The children live in ``self.probers`` (this class only creates the empty
    list; it is presumably filled in by subclasses -- confirm against them).
    The group remembers which child currently looks most likely and answers
    ``charset_name`` / ``language`` / ``get_confidence()`` on its behalf.
    """

    def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
        """Create an empty group; ``lang_filter`` is passed to the base class."""
        super().__init__(lang_filter=lang_filter)
        # Count of children whose ``active`` flag is still set.
        self._active_num = 0
        # Child probers consulted by feed() and get_confidence().
        self.probers: List[CharSetProber] = []
        # The child currently considered the best match, if any.
        self._best_guess_prober: Optional[CharSetProber] = None

    def reset(self) -> None:
        """Reset the group and every child, re-activating all children."""
        super().reset()
        self._active_num = 0
        for child in self.probers:
            child.reset()
            child.active = True
            self._active_num += 1
        self._best_guess_prober = None

    @property
    def charset_name(self) -> Optional[str]:
        """Charset name of the best-guess child, or ``None`` without one."""
        if not self._best_guess_prober:
            # Called for its side effect: get_confidence() may select a
            # best-guess child while it scores the active ones.
            self.get_confidence()
        leader = self._best_guess_prober
        return leader.charset_name if leader else None

    @property
    def language(self) -> Optional[str]:
        """Language of the best-guess child, or ``None`` without one."""
        if not self._best_guess_prober:
            # Called for its side effect: get_confidence() may select a
            # best-guess child while it scores the active ones.
            self.get_confidence()
        leader = self._best_guess_prober
        return leader.language if leader else None

    def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
        """Pass ``byte_str`` to each active child and update the group state.

        Stops early when a child reports ``FOUND_IT`` (that child becomes the
        best guess) or when the last active child reports ``NOT_ME``.

        Returns:
            The group's state after processing.
        """
        for child in self.probers:
            if not child.active:
                continue
            verdict = child.feed(byte_str)
            if not verdict:
                # A falsy result needs no bookkeeping (presumably the
                # "still detecting" state -- confirm against ProbingState).
                continue
            if verdict == ProbingState.FOUND_IT:
                # One definite answer settles the whole group.
                self._best_guess_prober = child
                self._state = ProbingState.FOUND_IT
                return self.state
            elif verdict == ProbingState.NOT_ME:
                # Retire this child; the group fails once none are left.
                child.active = False
                self._active_num -= 1
                if self._active_num <= 0:
                    self._state = ProbingState.NOT_ME
                    return self.state
        return self.state

    def get_confidence(self) -> float:
        """Return the group's confidence and refresh the best-guess child.

        Returns:
            ``0.99`` when the group state is ``FOUND_IT``, ``0.01`` when it is
            ``NOT_ME``; otherwise the highest confidence reported by an active
            child, or ``0.0`` when no child was selected.
        """
        current = self.state
        if current == ProbingState.FOUND_IT:
            return 0.99
        if current == ProbingState.NOT_ME:
            return 0.01

        top_score = 0.0
        # Start from scratch so a stale best guess never survives a re-score.
        self._best_guess_prober = None
        for child in self.probers:
            if child.active:
                score = child.get_confidence()
                self.logger.debug(
                    "%s %s confidence = %s", child.charset_name, child.language, score
                )
                # Strict comparison: the first child to reach a score keeps it.
                if score > top_score:
                    top_score = score
                    self._best_guess_prober = child
            else:
                self.logger.debug("%s not active", child.charset_name)
        return top_score if self._best_guess_prober else 0.0