Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/chardet/mbcssm.py: 98%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is mozilla.org code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 1998
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10# Mark Pilgrim - port to Python
11#
12# This library is free software; you can redistribute it and/or
13# modify it under the terms of the GNU Lesser General Public
14# License as published by the Free Software Foundation; either
15# version 2.1 of the License, or (at your option) any later version.
16#
17# This library is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20# Lesser General Public License for more details.
21#
22# You should have received a copy of the GNU Lesser General Public
23# License along with this library; if not, see
24# <https://www.gnu.org/licenses/>.
25######################### END LICENSE BLOCK #########################
from .codingstatemachinedict import CodingStateMachineDict
from .enums import MachineState

# BIG5

# fmt: off
# Byte-class table: indexed by byte value (0x00-0xFF), eight entries per row;
# the trailing comment on each row gives the byte range it covers.
BIG5_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07    # allow 0x00 as legal value
    1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f
    1, 1, 1, 1, 1, 1, 1, 1,  # 10 - 17
    1, 1, 1, 0, 1, 1, 1, 1,  # 18 - 1f
    1, 1, 1, 1, 1, 1, 1, 1,  # 20 - 27
    1, 1, 1, 1, 1, 1, 1, 1,  # 28 - 2f
    1, 1, 1, 1, 1, 1, 1, 1,  # 30 - 37
    1, 1, 1, 1, 1, 1, 1, 1,  # 38 - 3f
    2, 2, 2, 2, 2, 2, 2, 2,  # 40 - 47
    2, 2, 2, 2, 2, 2, 2, 2,  # 48 - 4f
    2, 2, 2, 2, 2, 2, 2, 2,  # 50 - 57
    2, 2, 2, 2, 2, 2, 2, 2,  # 58 - 5f
    2, 2, 2, 2, 2, 2, 2, 2,  # 60 - 67
    2, 2, 2, 2, 2, 2, 2, 2,  # 68 - 6f
    2, 2, 2, 2, 2, 2, 2, 2,  # 70 - 77
    2, 2, 2, 2, 2, 2, 2, 1,  # 78 - 7f
    4, 4, 4, 4, 4, 4, 4, 4,  # 80 - 87
    4, 4, 4, 4, 4, 4, 4, 4,  # 88 - 8f
    4, 4, 4, 4, 4, 4, 4, 4,  # 90 - 97
    4, 4, 4, 4, 4, 4, 4, 4,  # 98 - 9f
    4, 3, 3, 3, 3, 3, 3, 3,  # a0 - a7
    3, 3, 3, 3, 3, 3, 3, 3,  # a8 - af
    3, 3, 3, 3, 3, 3, 3, 3,  # b0 - b7
    3, 3, 3, 3, 3, 3, 3, 3,  # b8 - bf
    3, 3, 3, 3, 3, 3, 3, 3,  # c0 - c7
    3, 3, 3, 3, 3, 3, 3, 3,  # c8 - cf
    3, 3, 3, 3, 3, 3, 3, 3,  # d0 - d7
    3, 3, 3, 3, 3, 3, 3, 3,  # d8 - df
    3, 3, 3, 3, 3, 3, 3, 3,  # e0 - e7
    3, 3, 3, 3, 3, 3, 3, 3,  # e8 - ef
    3, 3, 3, 3, 3, 3, 3, 3,  # f0 - f7
    3, 3, 3, 3, 3, 3, 3, 0,  # f8 - ff
)
# fmt: on

# fmt: off
# State-transition table, stored flat. The trailing "# xx-yy" comment on each
# row is the hexadecimal index range of the entries on that row.
BIG5_ST = (
    MachineState.ERROR, MachineState.START, MachineState.START, 3, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # 00-07
    MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR,  # 08-0f
    MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START,  # 10-17
)
# fmt: on

# One entry per byte class (five classes, matching "class_factor" below).
BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0)

# Bundles the Big5 tables under the keys of CodingStateMachineDict.
BIG5_SM_MODEL: CodingStateMachineDict = {
    "class_table": BIG5_CLS,
    "class_factor": 5,
    "state_table": BIG5_ST,
    "char_len_table": BIG5_CHAR_LEN_TABLE,
    "name": "Big5",
}

# CP949
# fmt: off

"""
# Classes
0: Unused
1: 00-40, 5B-60, 7B-7F : Ascii
2: C7-FD
3: C9,FE : User-Defined Area
4: 41-52
5: 53-5A, 61-7A
6: 81-A0
7: A1-AC, B0-C5
8: AD-AF
9: C6

# Byte 1
Ascii: 00-7F : 1 + 4 + 5
State 3: 81-AC, B0-C5 : 6 + 7
State 4: AD-AF : 8
State 5: C6 : 9
State 6: C7-FE : 2 (+ 3)


# Byte 2
State 3: 41-5A, 61-7A, 81-FE : 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
State 4: 41-5A, 61-7A, 81-A0 : 4 + 5 + 6
State 5: 41-52, A1-FE : 2 + 3 + 4 + 7 + 8 + 9
State 6: A1-FE : 2 + 3 + 7 + 8 + 9
"""

# Byte-class table: indexed by byte value (0x00-0xFF), sixteen entries per
# row; the class numbers are the ones listed in the string above.
CP949_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,  # 00 - 0f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,  # 10 - 1f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 20 - 2f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 30 - 3f
    1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  # 40 - 4f
    4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1,  # 50 - 5f
    1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,  # 60 - 6f
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1,  # 70 - 7f
    0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,  # 80 - 8f
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,  # 90 - 9f
    6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8,  # a0 - af
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  # b0 - bf
    7, 7, 7, 7, 7, 7, 9, 2, 2, 3, 2, 2, 2, 2, 2, 2,  # c0 - cf
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # d0 - df
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # e0 - ef
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0,  # f0 - ff
)
# fmt: on

# fmt: off
# State-transition table: one row per state (labelled at the end of the row),
# one column per byte class 0-9.
CP949_ST = (
    # cls: 0            1                    2                    3                    4                    5                    6                    7                    8                    9
    MachineState.ERROR, MachineState.START, 6, MachineState.ERROR, MachineState.START, MachineState.START, 3, 3, 4, 5,  # START
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # ERROR
    MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME,  # ITSME
    MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START,  # 3
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # 4
    MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START,  # 5
    MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START,  # 6
)
# fmt: on

# One entry per byte class (ten classes, matching "class_factor" below).
CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 2, 2)

# Bundles the CP949 tables under the keys of CodingStateMachineDict.
CP949_SM_MODEL: CodingStateMachineDict = {
    "class_table": CP949_CLS,
    "class_factor": 10,
    "state_table": CP949_ST,
    "char_len_table": CP949_CHAR_LEN_TABLE,
    "name": "CP949",
}

# EUC-JP
# fmt: off
# Byte-class table: indexed by byte value (0x00-0xFF), eight entries per row;
# the trailing comment on each row gives the byte range it covers.
EUCJP_CLS = (
    4, 4, 4, 4, 4, 4, 4, 4,  # 00 - 07
    4, 4, 4, 4, 4, 4, 5, 5,  # 08 - 0f
    4, 4, 4, 4, 4, 4, 4, 4,  # 10 - 17
    4, 4, 4, 5, 4, 4, 4, 4,  # 18 - 1f
    4, 4, 4, 4, 4, 4, 4, 4,  # 20 - 27
    4, 4, 4, 4, 4, 4, 4, 4,  # 28 - 2f
    4, 4, 4, 4, 4, 4, 4, 4,  # 30 - 37
    4, 4, 4, 4, 4, 4, 4, 4,  # 38 - 3f
    4, 4, 4, 4, 4, 4, 4, 4,  # 40 - 47
    4, 4, 4, 4, 4, 4, 4, 4,  # 48 - 4f
    4, 4, 4, 4, 4, 4, 4, 4,  # 50 - 57
    4, 4, 4, 4, 4, 4, 4, 4,  # 58 - 5f
    4, 4, 4, 4, 4, 4, 4, 4,  # 60 - 67
    4, 4, 4, 4, 4, 4, 4, 4,  # 68 - 6f
    4, 4, 4, 4, 4, 4, 4, 4,  # 70 - 77
    4, 4, 4, 4, 4, 4, 4, 4,  # 78 - 7f
    5, 5, 5, 5, 5, 5, 5, 5,  # 80 - 87
    5, 5, 5, 5, 5, 5, 1, 3,  # 88 - 8f
    5, 5, 5, 5, 5, 5, 5, 5,  # 90 - 97
    5, 5, 5, 5, 5, 5, 5, 5,  # 98 - 9f
    5, 2, 2, 2, 2, 2, 2, 2,  # a0 - a7
    2, 2, 2, 2, 2, 2, 2, 2,  # a8 - af
    2, 2, 2, 2, 2, 2, 2, 2,  # b0 - b7
    2, 2, 2, 2, 2, 2, 2, 2,  # b8 - bf
    2, 2, 2, 2, 2, 2, 2, 2,  # c0 - c7
    2, 2, 2, 2, 2, 2, 2, 2,  # c8 - cf
    2, 2, 2, 2, 2, 2, 2, 2,  # d0 - d7
    2, 2, 2, 2, 2, 2, 2, 2,  # d8 - df
    0, 0, 0, 0, 0, 0, 0, 0,  # e0 - e7
    0, 0, 0, 0, 0, 0, 0, 0,  # e8 - ef
    0, 0, 0, 0, 0, 0, 0, 0,  # f0 - f7
    0, 0, 0, 0, 0, 0, 0, 5,  # f8 - ff
)
# fmt: on

# fmt: off
# State-transition table, stored flat. The trailing "# xx-yy" comment on each
# row is the hexadecimal index range of the entries on that row.
EUCJP_ST = (
    3, 4, 3, 5, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # 00-07
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME,  # 08-0f
    MachineState.ITS_ME, MachineState.ITS_ME, MachineState.START, MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # 10-17
    MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 3, MachineState.ERROR,  # 18-1f
    3, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START,  # 20-27
)
# fmt: on

# One entry per byte class (six classes, matching "class_factor" below).
EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0)

# Bundles the EUC-JP tables under the keys of CodingStateMachineDict.
EUCJP_SM_MODEL: CodingStateMachineDict = {
    "class_table": EUCJP_CLS,
    "class_factor": 6,
    "state_table": EUCJP_ST,
    "char_len_table": EUCJP_CHAR_LEN_TABLE,
    "name": "EUC-JP",
}

# EUC-KR
# fmt: off
# Byte-class table: indexed by byte value (0x00-0xFF), eight entries per row;
# the trailing comment on each row gives the byte range it covers.
EUCKR_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07
    1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f
    1, 1, 1, 1, 1, 1, 1, 1,  # 10 - 17
    1, 1, 1, 0, 1, 1, 1, 1,  # 18 - 1f
    1, 1, 1, 1, 1, 1, 1, 1,  # 20 - 27
    1, 1, 1, 1, 1, 1, 1, 1,  # 28 - 2f
    1, 1, 1, 1, 1, 1, 1, 1,  # 30 - 37
    1, 1, 1, 1, 1, 1, 1, 1,  # 38 - 3f
    1, 1, 1, 1, 1, 1, 1, 1,  # 40 - 47
    1, 1, 1, 1, 1, 1, 1, 1,  # 48 - 4f
    1, 1, 1, 1, 1, 1, 1, 1,  # 50 - 57
    1, 1, 1, 1, 1, 1, 1, 1,  # 58 - 5f
    1, 1, 1, 1, 1, 1, 1, 1,  # 60 - 67
    1, 1, 1, 1, 1, 1, 1, 1,  # 68 - 6f
    1, 1, 1, 1, 1, 1, 1, 1,  # 70 - 77
    1, 1, 1, 1, 1, 1, 1, 1,  # 78 - 7f
    0, 0, 0, 0, 0, 0, 0, 0,  # 80 - 87
    0, 0, 0, 0, 0, 0, 0, 0,  # 88 - 8f
    0, 0, 0, 0, 0, 0, 0, 0,  # 90 - 97
    0, 0, 0, 0, 0, 0, 0, 0,  # 98 - 9f
    0, 2, 2, 2, 2, 2, 2, 2,  # a0 - a7
    2, 2, 2, 2, 2, 3, 3, 3,  # a8 - af
    2, 2, 2, 2, 2, 2, 2, 2,  # b0 - b7
    2, 2, 2, 2, 2, 2, 2, 2,  # b8 - bf
    2, 2, 2, 2, 2, 2, 2, 2,  # c0 - c7
    2, 3, 2, 2, 2, 2, 2, 2,  # c8 - cf
    2, 2, 2, 2, 2, 2, 2, 2,  # d0 - d7
    2, 2, 2, 2, 2, 2, 2, 2,  # d8 - df
    2, 2, 2, 2, 2, 2, 2, 2,  # e0 - e7
    2, 2, 2, 2, 2, 2, 2, 2,  # e8 - ef
    2, 2, 2, 2, 2, 2, 2, 2,  # f0 - f7
    2, 2, 2, 2, 2, 2, 2, 0,  # f8 - ff
)
# fmt: on

# fmt: off
# State-transition table, stored flat. The trailing "# xx-yy" comment on each
# row is the hexadecimal index range of the entries on that row.
EUCKR_ST = (
    MachineState.ERROR, MachineState.START, 3, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # 00-07
    MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START,  # 08-0f
)
# fmt: on

# One entry per byte class (four classes, matching "class_factor" below).
EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0)

# Bundles the EUC-KR tables under the keys of CodingStateMachineDict.
EUCKR_SM_MODEL: CodingStateMachineDict = {
    "class_table": EUCKR_CLS,
    "class_factor": 4,
    "state_table": EUCKR_ST,
    "char_len_table": EUCKR_CHAR_LEN_TABLE,
    "name": "EUC-KR",
}

# JOHAB
# fmt: off
# Byte-class table: indexed by byte value (0x00-0xFF), eight entries per row;
# the trailing comment on each row gives the byte range it covers.
JOHAB_CLS = (
    4, 4, 4, 4, 4, 4, 4, 4,  # 00 - 07
    4, 4, 4, 4, 4, 4, 0, 0,  # 08 - 0f
    4, 4, 4, 4, 4, 4, 4, 4,  # 10 - 17
    4, 4, 4, 0, 4, 4, 4, 4,  # 18 - 1f
    4, 4, 4, 4, 4, 4, 4, 4,  # 20 - 27
    4, 4, 4, 4, 4, 4, 4, 4,  # 28 - 2f
    4, 3, 3, 3, 3, 3, 3, 3,  # 30 - 37
    3, 3, 3, 3, 3, 3, 3, 3,  # 38 - 3f
    3, 1, 1, 1, 1, 1, 1, 1,  # 40 - 47
    1, 1, 1, 1, 1, 1, 1, 1,  # 48 - 4f
    1, 1, 1, 1, 1, 1, 1, 1,  # 50 - 57
    1, 1, 1, 1, 1, 1, 1, 1,  # 58 - 5f
    1, 1, 1, 1, 1, 1, 1, 1,  # 60 - 67
    1, 1, 1, 1, 1, 1, 1, 1,  # 68 - 6f
    1, 1, 1, 1, 1, 1, 1, 1,  # 70 - 77
    1, 1, 1, 1, 1, 1, 1, 2,  # 78 - 7f
    6, 6, 6, 6, 8, 8, 8, 8,  # 80 - 87
    8, 8, 8, 8, 8, 8, 8, 8,  # 88 - 8f
    8, 7, 7, 7, 7, 7, 7, 7,  # 90 - 97
    7, 7, 7, 7, 7, 7, 7, 7,  # 98 - 9f
    7, 7, 7, 7, 7, 7, 7, 7,  # a0 - a7
    7, 7, 7, 7, 7, 7, 7, 7,  # a8 - af
    7, 7, 7, 7, 7, 7, 7, 7,  # b0 - b7
    7, 7, 7, 7, 7, 7, 7, 7,  # b8 - bf
    7, 7, 7, 7, 7, 7, 7, 7,  # c0 - c7
    7, 7, 7, 7, 7, 7, 7, 7,  # c8 - cf
    7, 7, 7, 7, 5, 5, 5, 5,  # d0 - d7
    5, 9, 9, 9, 9, 9, 9, 5,  # d8 - df
    9, 9, 9, 9, 9, 9, 9, 9,  # e0 - e7
    9, 9, 9, 9, 9, 9, 9, 9,  # e8 - ef
    9, 9, 9, 9, 9, 9, 9, 9,  # f0 - f7
    9, 9, 5, 5, 5, 5, 5, 0,  # f8 - ff
)
# fmt: on

# fmt: off
# State-transition table: one row per state (labelled at the end of the row),
# one column per byte class 0-9.
#
# The row position is the state number, so the rows must follow the numeric
# order START (0), ERROR (1), ITS_ME (2). With the ITS_ME row placed at index 1
# a machine that is fed another byte after reaching ERROR would jump to ITS_ME;
# keeping the ERROR row at index 1 makes both terminal states absorbing.
JOHAB_ST = (
    # cls: 0            1                    2                    3                    4                    5                    6                    7                    8                    9
    MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, 3, 3, 4,  # START (0)
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # ERROR (1)
    MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME,  # ITS_ME (2)
    MachineState.ERROR, MachineState.START, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.START,  # 3
    MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.START,  # 4
)
# fmt: on

# One entry per byte class (ten classes, matching "class_factor" below).
JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2)

# Bundles the Johab tables under the keys of CodingStateMachineDict.
JOHAB_SM_MODEL: CodingStateMachineDict = {
    "class_table": JOHAB_CLS,
    "class_factor": 10,
    "state_table": JOHAB_ST,
    "char_len_table": JOHAB_CHAR_LEN_TABLE,
    "name": "Johab",
}

# GB2312 - REMOVED
# GB2312 is a subset of GB18030. The GB18030 state machine and prober now
# correctly detect GB2312 content with the same confidence as the old GB2312
# prober (both use GB2312DistributionAnalysis). The LEGACY_MAP renames
# GB2312 → GB18030 for backward compatibility.
# Having both probers was redundant after fixing GB18030's char_len_table.

# GB18030
# GB18030 is a superset of GB2312 and GBK
# It supports:
# - 1-byte: ASCII (0x00-0x7F)
# - 2-byte: lead 0x81-0xFE, trail 0x40-0x7E or 0x80-0xFE (GBK/GB2312 compatible)
# - 4-byte: 0x81-0xFE, 0x30-0x39, 0x81-0xFE, 0x30-0x39 (GB18030 extension)
#
# Byte classes (as assigned by the table below):
# 0: Invalid (0x0E, 0x0F, 0x1B, 0xFF)
# 1: Single-byte ASCII that is neither a digit nor a trail byte
#    (0x00-0x2F and 0x3A-0x3F, except the class-0 bytes)
# 2: Digit 0x30-0x39 (single byte, or 2nd/4th byte of a 4-byte sequence)
# 3: 0x40-0x7E (single byte, or trail byte of a 2-byte sequence)
# 4: 0x7F (accepted as a single byte, rejected as a trail byte)
# 5: 0x80 (valid only as the trail byte of a 2-byte sequence)
# 6: 0x81-0xFE (lead byte, trail byte of a 2-byte sequence, or 3rd byte of a
#    4-byte sequence)

# fmt: off
GB18030_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07
    1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f
    1, 1, 1, 1, 1, 1, 1, 1,  # 10 - 17
    1, 1, 1, 0, 1, 1, 1, 1,  # 18 - 1f
    1, 1, 1, 1, 1, 1, 1, 1,  # 20 - 27
    1, 1, 1, 1, 1, 1, 1, 1,  # 28 - 2f
    2, 2, 2, 2, 2, 2, 2, 2,  # 30 - 37
    2, 2, 1, 1, 1, 1, 1, 1,  # 38 - 3f
    3, 3, 3, 3, 3, 3, 3, 3,  # 40 - 47
    3, 3, 3, 3, 3, 3, 3, 3,  # 48 - 4f
    3, 3, 3, 3, 3, 3, 3, 3,  # 50 - 57
    3, 3, 3, 3, 3, 3, 3, 3,  # 58 - 5f
    3, 3, 3, 3, 3, 3, 3, 3,  # 60 - 67
    3, 3, 3, 3, 3, 3, 3, 3,  # 68 - 6f
    3, 3, 3, 3, 3, 3, 3, 3,  # 70 - 77
    3, 3, 3, 3, 3, 3, 3, 4,  # 78 - 7f
    5, 6, 6, 6, 6, 6, 6, 6,  # 80 - 87  0x80 can only be a trail byte (class 5)
    6, 6, 6, 6, 6, 6, 6, 6,  # 88 - 8f
    6, 6, 6, 6, 6, 6, 6, 6,  # 90 - 97
    6, 6, 6, 6, 6, 6, 6, 6,  # 98 - 9f
    6, 6, 6, 6, 6, 6, 6, 6,  # a0 - a7
    6, 6, 6, 6, 6, 6, 6, 6,  # a8 - af
    6, 6, 6, 6, 6, 6, 6, 6,  # b0 - b7
    6, 6, 6, 6, 6, 6, 6, 6,  # b8 - bf
    6, 6, 6, 6, 6, 6, 6, 6,  # c0 - c7
    6, 6, 6, 6, 6, 6, 6, 6,  # c8 - cf
    6, 6, 6, 6, 6, 6, 6, 6,  # d0 - d7
    6, 6, 6, 6, 6, 6, 6, 6,  # d8 - df
    6, 6, 6, 6, 6, 6, 6, 6,  # e0 - e7
    6, 6, 6, 6, 6, 6, 6, 6,  # e8 - ef
    6, 6, 6, 6, 6, 6, 6, 6,  # f0 - f7
    6, 6, 6, 6, 6, 6, 6, 0,  # f8 - ff  0xFF is invalid
)
# fmt: on

# States:
#   START (0):        Initial state
#   ERROR (1):        Error state
#   ITS_ME (2):       Definitive match
#   FIRST (3):        After receiving lead byte (0x81-0xFE)
#   SECOND_4BYTE (4): After a digit as 2nd byte of a potential 4-byte sequence
#   THIRD_4BYTE (5):  After 3rd byte (0x81-0xFE) in 4-byte sequence
# fmt: off
GB18030_ST = (
    # cls: 0            1                    2                    3                    4                    5                    6
    MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START, MachineState.ERROR, 3,  # START (0)
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # ERROR (1)
    MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME,  # ITS_ME (2)
    MachineState.ERROR, MachineState.ERROR, 4, MachineState.START, MachineState.ERROR, MachineState.START, MachineState.START,  # FIRST (3): digit -> 4-byte candidate; 0x40-0x7E / 0x80-0xFE completes 2-byte
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 5,  # SECOND_4BYTE (4): only 0x81-0xFE may follow the 2nd-byte digit
    MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # THIRD_4BYTE (5): only a digit completes the 4-byte sequence
)
# fmt: on

# Character length table for distribution analysis (one entry per byte class)
# Class 6 (lead byte) is marked as 2 bytes since that's the most common case
# (2-byte GB2312/GBK sequences). 4-byte sequences will be detected by the state
# machine but won't contribute to character distribution analysis.
GB18030_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 2, 2)

# Bundles the GB18030 tables under the keys of CodingStateMachineDict.
GB18030_SM_MODEL: CodingStateMachineDict = {
    "class_table": GB18030_CLS,
    "class_factor": 7,
    "state_table": GB18030_ST,
    "char_len_table": GB18030_CHAR_LEN_TABLE,
    "name": "GB18030",
}

# Shift_JIS
# fmt: off
# Byte-class table: indexed by byte value (0x00-0xFF), eight entries per row;
# the trailing comment on each row gives the byte range it covers.
SJIS_CLS = (
    1, 1, 1, 1, 1, 1, 1, 1,  # 00 - 07
    1, 1, 1, 1, 1, 1, 0, 0,  # 08 - 0f
    1, 1, 1, 1, 1, 1, 1, 1,  # 10 - 17
    1, 1, 1, 0, 1, 1, 1, 1,  # 18 - 1f
    1, 1, 1, 1, 1, 1, 1, 1,  # 20 - 27
    1, 1, 1, 1, 1, 1, 1, 1,  # 28 - 2f
    1, 1, 1, 1, 1, 1, 1, 1,  # 30 - 37
    1, 1, 1, 1, 1, 1, 1, 1,  # 38 - 3f
    2, 2, 2, 2, 2, 2, 2, 2,  # 40 - 47
    2, 2, 2, 2, 2, 2, 2, 2,  # 48 - 4f
    2, 2, 2, 2, 2, 2, 2, 2,  # 50 - 57
    2, 2, 2, 2, 2, 2, 2, 2,  # 58 - 5f
    2, 2, 2, 2, 2, 2, 2, 2,  # 60 - 67
    2, 2, 2, 2, 2, 2, 2, 2,  # 68 - 6f
    2, 2, 2, 2, 2, 2, 2, 2,  # 70 - 77
    2, 2, 2, 2, 2, 2, 2, 1,  # 78 - 7f
    3, 3, 3, 3, 3, 2, 2, 3,  # 80 - 87
    3, 3, 3, 3, 3, 3, 3, 3,  # 88 - 8f
    3, 3, 3, 3, 3, 3, 3, 3,  # 90 - 97
    3, 3, 3, 3, 3, 3, 3, 3,  # 98 - 9f
    # 0xa0 is illegal in the SJIS encoding, but some pages do contain this
    # byte, so the table is deliberately forgiving about it.
    2, 2, 2, 2, 2, 2, 2, 2,  # a0 - a7
    2, 2, 2, 2, 2, 2, 2, 2,  # a8 - af
    2, 2, 2, 2, 2, 2, 2, 2,  # b0 - b7
    2, 2, 2, 2, 2, 2, 2, 2,  # b8 - bf
    2, 2, 2, 2, 2, 2, 2, 2,  # c0 - c7
    2, 2, 2, 2, 2, 2, 2, 2,  # c8 - cf
    2, 2, 2, 2, 2, 2, 2, 2,  # d0 - d7
    2, 2, 2, 2, 2, 2, 2, 2,  # d8 - df
    3, 3, 3, 3, 3, 3, 3, 3,  # e0 - e7
    3, 3, 3, 3, 3, 4, 4, 4,  # e8 - ef
    3, 3, 3, 3, 3, 3, 3, 3,  # f0 - f7
    3, 3, 3, 3, 3, 0, 0, 0,  # f8 - ff
)
# fmt: on

# fmt: off
# State-transition table, stored flat. The trailing "# xx-yy" comment on each
# row is the hexadecimal index range of the entries on that row.
SJIS_ST = (
    MachineState.ERROR, MachineState.START, MachineState.START, 3, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # 00-07
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME,  # 08-0f
    MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START, MachineState.START, MachineState.START,  # 10-17
)
# fmt: on

# One entry per byte class (six classes, matching "class_factor" below).
SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0)

# Bundles the Shift_JIS tables under the keys of CodingStateMachineDict.
SJIS_SM_MODEL: CodingStateMachineDict = {
    "class_table": SJIS_CLS,
    "class_factor": 6,
    "state_table": SJIS_ST,
    "char_len_table": SJIS_CHAR_LEN_TABLE,
    "name": "Shift_JIS",
}

# UCS2-BE
# fmt: off
# Byte-class table: indexed by byte value (0x00-0xFF), eight entries per row;
# the trailing comment on each row gives the byte range it covers.
UCS2BE_CLS = (
    0, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
    0, 0, 1, 0, 0, 2, 0, 0,  # 08 - 0f
    0, 0, 0, 0, 0, 0, 0, 0,  # 10 - 17
    0, 0, 0, 3, 0, 0, 0, 0,  # 18 - 1f
    0, 0, 0, 0, 0, 0, 0, 0,  # 20 - 27
    0, 3, 3, 3, 3, 3, 0, 0,  # 28 - 2f
    0, 0, 0, 0, 0, 0, 0, 0,  # 30 - 37
    0, 0, 0, 0, 0, 0, 0, 0,  # 38 - 3f
    0, 0, 0, 0, 0, 0, 0, 0,  # 40 - 47
    0, 0, 0, 0, 0, 0, 0, 0,  # 48 - 4f
    0, 0, 0, 0, 0, 0, 0, 0,  # 50 - 57
    0, 0, 0, 0, 0, 0, 0, 0,  # 58 - 5f
    0, 0, 0, 0, 0, 0, 0, 0,  # 60 - 67
    0, 0, 0, 0, 0, 0, 0, 0,  # 68 - 6f
    0, 0, 0, 0, 0, 0, 0, 0,  # 70 - 77
    0, 0, 0, 0, 0, 0, 0, 0,  # 78 - 7f
    0, 0, 0, 0, 0, 0, 0, 0,  # 80 - 87
    0, 0, 0, 0, 0, 0, 0, 0,  # 88 - 8f
    0, 0, 0, 0, 0, 0, 0, 0,  # 90 - 97
    0, 0, 0, 0, 0, 0, 0, 0,  # 98 - 9f
    0, 0, 0, 0, 0, 0, 0, 0,  # a0 - a7
    0, 0, 0, 0, 0, 0, 0, 0,  # a8 - af
    0, 0, 0, 0, 0, 0, 0, 0,  # b0 - b7
    0, 0, 0, 0, 0, 0, 0, 0,  # b8 - bf
    0, 0, 0, 0, 0, 0, 0, 0,  # c0 - c7
    0, 0, 0, 0, 0, 0, 0, 0,  # c8 - cf
    0, 0, 0, 0, 0, 0, 0, 0,  # d0 - d7
    0, 0, 0, 0, 0, 0, 0, 0,  # d8 - df
    0, 0, 0, 0, 0, 0, 0, 0,  # e0 - e7
    0, 0, 0, 0, 0, 0, 0, 0,  # e8 - ef
    0, 0, 0, 0, 0, 0, 0, 0,  # f0 - f7
    0, 0, 0, 0, 0, 0, 4, 5,  # f8 - ff
)
# fmt: on

# fmt: off
# State-transition table, stored flat. The trailing "# xx-yy" comment on each
# row is the hexadecimal index range of the entries on that row.
UCS2BE_ST = (
    5, 7, 7, MachineState.ERROR, 4, 3, MachineState.ERROR, MachineState.ERROR,  # 00-07
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME,  # 08-0f
    MachineState.ITS_ME, MachineState.ITS_ME, 6, 6, 6, 6, MachineState.ERROR, MachineState.ERROR,  # 10-17
    6, 6, 6, 6, 6, MachineState.ITS_ME, 6, 6,  # 18-1f
    6, 6, 6, 6, 5, 7, 7, MachineState.ERROR,  # 20-27
    5, 8, 6, 6, MachineState.ERROR, 6, 6, 6,  # 28-2f
    6, 6, 6, 6, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.START,  # 30-37
)
# fmt: on

# One entry per byte class (six classes, matching "class_factor" below).
UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2)

# Bundles the UTF-16BE tables under the keys of CodingStateMachineDict.
UCS2BE_SM_MODEL: CodingStateMachineDict = {
    "class_table": UCS2BE_CLS,
    "class_factor": 6,
    "state_table": UCS2BE_ST,
    "char_len_table": UCS2BE_CHAR_LEN_TABLE,
    "name": "UTF-16BE",
}

# UCS2-LE
# fmt: off
# Byte-class table: indexed by byte value (0x00-0xFF), eight entries per row;
# the trailing comment on each row gives the byte range it covers.
UCS2LE_CLS = (
    0, 0, 0, 0, 0, 0, 0, 0,  # 00 - 07
    0, 0, 1, 0, 0, 2, 0, 0,  # 08 - 0f
    0, 0, 0, 0, 0, 0, 0, 0,  # 10 - 17
    0, 0, 0, 3, 0, 0, 0, 0,  # 18 - 1f
    0, 0, 0, 0, 0, 0, 0, 0,  # 20 - 27
    0, 3, 3, 3, 3, 3, 0, 0,  # 28 - 2f
    0, 0, 0, 0, 0, 0, 0, 0,  # 30 - 37
    0, 0, 0, 0, 0, 0, 0, 0,  # 38 - 3f
    0, 0, 0, 0, 0, 0, 0, 0,  # 40 - 47
    0, 0, 0, 0, 0, 0, 0, 0,  # 48 - 4f
    0, 0, 0, 0, 0, 0, 0, 0,  # 50 - 57
    0, 0, 0, 0, 0, 0, 0, 0,  # 58 - 5f
    0, 0, 0, 0, 0, 0, 0, 0,  # 60 - 67
    0, 0, 0, 0, 0, 0, 0, 0,  # 68 - 6f
    0, 0, 0, 0, 0, 0, 0, 0,  # 70 - 77
    0, 0, 0, 0, 0, 0, 0, 0,  # 78 - 7f
    0, 0, 0, 0, 0, 0, 0, 0,  # 80 - 87
    0, 0, 0, 0, 0, 0, 0, 0,  # 88 - 8f
    0, 0, 0, 0, 0, 0, 0, 0,  # 90 - 97
    0, 0, 0, 0, 0, 0, 0, 0,  # 98 - 9f
    0, 0, 0, 0, 0, 0, 0, 0,  # a0 - a7
    0, 0, 0, 0, 0, 0, 0, 0,  # a8 - af
    0, 0, 0, 0, 0, 0, 0, 0,  # b0 - b7
    0, 0, 0, 0, 0, 0, 0, 0,  # b8 - bf
    0, 0, 0, 0, 0, 0, 0, 0,  # c0 - c7
    0, 0, 0, 0, 0, 0, 0, 0,  # c8 - cf
    0, 0, 0, 0, 0, 0, 0, 0,  # d0 - d7
    0, 0, 0, 0, 0, 0, 0, 0,  # d8 - df
    0, 0, 0, 0, 0, 0, 0, 0,  # e0 - e7
    0, 0, 0, 0, 0, 0, 0, 0,  # e8 - ef
    0, 0, 0, 0, 0, 0, 0, 0,  # f0 - f7
    0, 0, 0, 0, 0, 0, 4, 5,  # f8 - ff
)
# fmt: on

# fmt: off
# State-transition table, stored flat. The trailing "# xx-yy" comment on each
# row is the hexadecimal index range of the entries on that row.
UCS2LE_ST = (
    6, 6, 7, 6, 4, 3, MachineState.ERROR, MachineState.ERROR,  # 00-07
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME,  # 08-0f
    MachineState.ITS_ME, MachineState.ITS_ME, 5, 5, 5, MachineState.ERROR, MachineState.ITS_ME, MachineState.ERROR,  # 10-17
    5, 5, 5, MachineState.ERROR, 5, MachineState.ERROR, 6, 6,  # 18-1f
    7, 6, 8, 8, 5, 5, 5, MachineState.ERROR,  # 20-27
    5, 5, 5, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 5, 5,  # 28-2f
    5, 5, 5, MachineState.ERROR, 5, MachineState.ERROR, MachineState.START, MachineState.START,  # 30-37
)
# fmt: on

# One entry per byte class (six classes, matching "class_factor" below).
UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2)

# Bundles the UTF-16LE tables under the keys of CodingStateMachineDict.
UCS2LE_SM_MODEL: CodingStateMachineDict = {
    "class_table": UCS2LE_CLS,
    "class_factor": 6,
    "state_table": UCS2LE_ST,
    "char_len_table": UCS2LE_CHAR_LEN_TABLE,
    "name": "UTF-16LE",
}

# UTF-8
# Adapted from Björn Höhrmann's DFA UTF-8 decoder
# See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
# Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
# fmt: off
# Byte-class table: indexed by byte value (0x00-0xFF), sixteen entries per
# row; the trailing comment on each row gives the byte range it covers.
UTF8_CLS = (
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 00-0f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 10-1f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 20-2f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 30-3f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 40-4f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 50-5f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 60-6f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 70-7f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 80-8f
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,  # 90-9f
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  # a0-af
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  # b0-bf
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # c0-cf
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # d0-df
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,  # e0-ef
    11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,  # f0-ff
)
# fmt: on

# Höhrmann's DFA has states 0,12,24,36,48,60,72,84,96. Its state 0 (ACCEPT) is
# START (row 0) and its state 12 (REJECT) is ERROR (row 1); row 2 (ITS_ME) has
# no counterpart in the DFA, and its seven intermediate states 24..96 are
# rows 3-9 here.
# fmt: off
UTF8_ST = (
    # one row per state, one column per byte class 0-11
    MachineState.START, MachineState.ERROR, 3, 4, 6, 9, 8, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 5, 7,  # state 0 (START)
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # state 1 (ERROR)
    MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ITS_ME,  # state 2 (ITS_ME)
    MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.START, MachineState.ERROR, MachineState.ERROR,  # state 3
    MachineState.ERROR, 3, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 3, MachineState.ERROR, 3, MachineState.ERROR, MachineState.ERROR,  # state 4
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 3, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # state 5
    MachineState.ERROR, 3, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 3, MachineState.ERROR, MachineState.ERROR,  # state 6
    MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR,  # state 7
    MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, 4, MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR,  # state 8
    MachineState.ERROR, 4, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR, MachineState.ERROR,  # state 9
)
# fmt: on

# One entry per byte class (twelve classes, matching "class_factor" below).
UTF8_CHAR_LEN_TABLE = (1, 1, 2, 3, 3, 4, 4, 1, 1, 1, 3, 4)

# Bundles the UTF-8 tables under the keys of CodingStateMachineDict.
UTF8_SM_MODEL: CodingStateMachineDict = {
    "class_table": UTF8_CLS,
    "class_factor": 12,
    "state_table": UTF8_ST,
    "char_len_table": UTF8_CHAR_LEN_TABLE,
    "name": "UTF-8",
}