1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is Mozilla Communicator client code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 1998
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11#
12# This library is free software; you can redistribute it and/or
13# modify it under the terms of the GNU Lesser General Public
14# License as published by the Free Software Foundation; either
15# version 2.1 of the License, or (at your option) any later version.
16#
17# This library is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20# Lesser General Public License for more details.
21#
22# You should have received a copy of the GNU Lesser General Public
23# License along with this library; if not, write to the Free Software
24# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25# 02110-1301 USA
26######################### END LICENSE BLOCK #########################
27
28from typing import Tuple, Union
29
30from .big5freq import (
31 BIG5_CHAR_TO_FREQ_ORDER,
32 BIG5_TABLE_SIZE,
33 BIG5_TYPICAL_DISTRIBUTION_RATIO,
34)
35from .euckrfreq import (
36 EUCKR_CHAR_TO_FREQ_ORDER,
37 EUCKR_TABLE_SIZE,
38 EUCKR_TYPICAL_DISTRIBUTION_RATIO,
39)
40from .euctwfreq import (
41 EUCTW_CHAR_TO_FREQ_ORDER,
42 EUCTW_TABLE_SIZE,
43 EUCTW_TYPICAL_DISTRIBUTION_RATIO,
44)
45from .gb2312freq import (
46 GB2312_CHAR_TO_FREQ_ORDER,
47 GB2312_TABLE_SIZE,
48 GB2312_TYPICAL_DISTRIBUTION_RATIO,
49)
50from .jisfreq import (
51 JIS_CHAR_TO_FREQ_ORDER,
52 JIS_TABLE_SIZE,
53 JIS_TYPICAL_DISTRIBUTION_RATIO,
54)
55from .johabfreq import JOHAB_TO_EUCKR_ORDER_TABLE
56
57
class CharDistributionAnalysis:
    """Base character-distribution analyser.

    Counts the two-byte characters handed to :meth:`feed`, along with how many
    of them have a frequency order below 512, and derives a confidence value
    from those two counters.  Subclasses supply the frequency table, its size,
    the typical distribution ratio and an encoding-specific :meth:`get_order`.
    """

    ENOUGH_DATA_THRESHOLD = 1024
    SURE_YES = 0.99
    SURE_NO = 0.01
    MINIMUM_DATA_THRESHOLD = 3

    def __init__(self) -> None:
        # Frequency order for every character order (the value returned by
        # get_order()).  Empty here; subclasses install the real table.
        self._char_to_freq_order: Tuple[int, ...] = ()
        # Number of entries in the table above.
        self._table_size = 0
        # Language-dependent constant used when computing confidence.  See
        # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
        # for further detail.
        self.typical_distribution_ratio = 0.0
        # Running state: declared here, then (re)initialised by reset().
        self._done = False
        self._total_chars = 0
        self._freq_chars = 0
        self.reset()

    def reset(self) -> None:
        """Clear all running state so the analyser can be reused."""
        # True once detection is finished and a conclusion has been reached.
        self._done = False
        # Characters seen so far that produced a non-negative order.
        self._total_chars = 0
        # Subset of the above whose frequency order is below 512.
        self._freq_chars = 0

    def feed(self, char: Union[bytes, bytearray], char_len: int) -> None:
        """Account for one character whose byte length is already known."""
        # Only two-byte characters take part in the distribution analysis.
        if char_len != 2:
            return
        order = self.get_order(char)
        if order < 0:
            # Not a character this analyser ranks.
            return
        self._total_chars += 1
        # Orders beyond the table are counted in the total but can never be
        # "frequent".
        if order < self._table_size and self._char_to_freq_order[order] < 512:
            self._freq_chars += 1

    def get_confidence(self) -> float:
        """Return a confidence value computed from the counters so far."""
        total = self._total_chars
        frequent = self._freq_chars
        # Nothing in our consideration range, or too few frequent characters:
        # give the negative answer.
        if total <= 0 or frequent <= self.MINIMUM_DATA_THRESHOLD:
            return self.SURE_NO

        infrequent = total - frequent
        if infrequent != 0:
            ratio = frequent / (infrequent * self.typical_distribution_ratio)
            if ratio < self.SURE_YES:
                return ratio

        # Cap the result: we never want to claim 100% certainty.
        return self.SURE_YES

    def got_enough_data(self) -> bool:
        """Tell whether more than ENOUGH_DATA_THRESHOLD characters were seen."""
        # A conclusion does not require the whole input; a certain amount of
        # data is sufficient for charset detection.
        return self._total_chars > self.ENOUGH_DATA_THRESHOLD

    def get_order(self, _: Union[bytes, bytearray]) -> int:
        """Map a character's bytes to its order; the base class ranks nothing.

        Characters are not handled through their raw encoded bytes but through
        a number, called the order, so that several encodings of one language
        can share a single frequency table.  This base implementation always
        returns -1.
        """
        return -1
130
131
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser wired to the EUC-TW frequency table."""

    def __init__(self) -> None:
        super().__init__()
        self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER
        self._table_size = EUCTW_TABLE_SIZE
        self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:  # type: ignore[reportIncompatibleMethodOverride]
        """Return the order of a two-byte EUC-TW character, or -1.

        Bytes of interest:
          first byte range:  0xc4 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        The bytes are not validated here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xC4:
            return -1
        # 94 positions per lead byte, counted from lead byte 0xC4.
        return 94 * (lead - 0xC4) + byte_str[1] - 0xA1
148
149
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser wired to the EUC-KR frequency table."""

    def __init__(self) -> None:
        super().__init__()
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
        self._table_size = EUCKR_TABLE_SIZE
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:  # type: ignore[reportIncompatibleMethodOverride]
        """Return the order of a two-byte EUC-KR character, or -1.

        Bytes of interest:
          first byte range:  0xb0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        The bytes are not validated here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xB0:
            return -1
        # 94 positions per lead byte, counted from lead byte 0xB0.
        return 94 * (lead - 0xB0) + byte_str[1] - 0xA1
166
167
class JOHABDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser for JOHAB that reuses the EUC-KR frequency table."""

    def __init__(self) -> None:
        super().__init__()
        self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER
        self._table_size = EUCKR_TABLE_SIZE
        self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:  # type: ignore[reportIncompatibleMethodOverride]
        """Return the EUC-KR order for a two-byte JOHAB character, or -1.

        Only lead bytes in 0x88 -- 0xd3 are looked up; the 16-bit code built
        from both bytes is translated through JOHAB_TO_EUCKR_ORDER_TABLE, and
        codes missing from that table yield -1.
        """
        lead = byte_str[0]
        if not 0x88 <= lead < 0xD4:
            return -1
        johab_code = lead * 256 + byte_str[1]
        return JOHAB_TO_EUCKR_ORDER_TABLE.get(johab_code, -1)
181
182
class GB2312DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser wired to the GB2312 frequency table."""

    def __init__(self) -> None:
        super().__init__()
        self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER
        self._table_size = GB2312_TABLE_SIZE
        self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:  # type: ignore[reportIncompatibleMethodOverride]
        """Return the order of a two-byte GB2312 character, or -1.

        Bytes of interest:
          first byte range:  0xb0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        Only the lower bounds are checked here; the state machine has already
        validated the sequence.
        """
        lead, trail = byte_str[0], byte_str[1]
        if lead < 0xB0 or trail < 0xA1:
            return -1
        # 94 positions per lead byte, counted from lead byte 0xB0.
        return 94 * (lead - 0xB0) + trail - 0xA1
199
200
class Big5DistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser wired to the Big5 frequency table."""

    def __init__(self) -> None:
        super().__init__()
        self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER
        self._table_size = BIG5_TABLE_SIZE
        self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:  # type: ignore[reportIncompatibleMethodOverride]
        """Return the order of a two-byte Big5 character, or -1.

        Bytes of interest:
          first byte range:  0xa4 -- 0xfe
          second byte range: 0x40 -- 0x7e, 0xa1 -- 0xfe
        The bytes are not validated here; the state machine has done that.
        """
        lead, trail = byte_str[0], byte_str[1]
        if lead < 0xA4:
            return -1
        # Each lead byte spans 157 positions: 63 for trail bytes 0x40-0x7e
        # followed by 94 for trail bytes 0xa1-0xfe.
        row_start = 157 * (lead - 0xA4)
        if trail >= 0xA1:
            return row_start + trail - 0xA1 + 63
        return row_start + trail - 0x40
219
220
class SJISDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser for Shift_JIS, using the shared JIS frequency table."""

    def __init__(self) -> None:
        super().__init__()
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
        self._table_size = JIS_TABLE_SIZE
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:  # type: ignore[reportIncompatibleMethodOverride]
        """Return the order of a two-byte Shift_JIS character, or -1.

        Bytes of interest:
          first byte range:  0x81 -- 0x9f, 0xe0 -- 0xef
          second byte range: 0x40 -- 0x7e, 0x80 -- 0xfc
        The bytes are not validated here; the state machine has done that.

        Returns:
            The index into the JIS frequency table, or -1 when the lead byte
            is outside the two ranges above.
        """
        first_char, second_char = byte_str[0], byte_str[1]
        # Every lead byte covers 188 consecutive table positions.  The second
        # lead range continues right after the 31 lead bytes 0x81-0x9f.
        if 0x81 <= first_char <= 0x9F:
            order = 188 * (first_char - 0x81)
        elif 0xE0 <= first_char <= 0xEF:
            order = 188 * (first_char - 0xE0 + 31)
        else:
            return -1
        order += second_char - 0x40
        # Trail bytes skip 0x7f, so everything above it sits one position
        # lower than the raw byte offset.  (Previously these characters were
        # rejected with -1, which dropped them from the analysis entirely.)
        if second_char > 0x7F:
            order -= 1
        return order
244
245
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
    """Distribution analyser for EUC-JP, using the shared JIS frequency table."""

    def __init__(self) -> None:
        super().__init__()
        self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER
        self._table_size = JIS_TABLE_SIZE
        self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO

    def get_order(self, byte_str: Union[bytes, bytearray]) -> int:  # type: ignore[reportIncompatibleMethodOverride]
        """Return the order of a two-byte EUC-JP character, or -1.

        Bytes of interest:
          first byte range:  0xa0 -- 0xfe
          second byte range: 0xa1 -- 0xfe
        The bytes are not validated here; the state machine has done that.
        """
        lead = byte_str[0]
        if lead < 0xA0:
            return -1
        # Rows are counted from lead byte 0xA1, so a lead byte of 0xA0 passes
        # the guard above but contributes a negative row offset (-94).
        return 94 * (lead - 0xA1) + byte_str[1] - 0xA1